Cost Optimization
AWS costs can spiral out of control quickly without proper governance and optimization strategies. Terraform helps by making cost controls repeatable and enforceable, but you need to understand AWS pricing models, implement proper tagging strategies, and automate resource lifecycle management to keep costs under control.
This part covers the patterns and practices for implementing cost optimization with Terraform, from basic tagging strategies to advanced automation that right-sizes resources and manages their lifecycle.
Comprehensive Tagging Strategy
Consistent tagging is the foundation of cost management and allocation:
# Global tagging strategy
locals {
# Required tags for all resources
required_tags = {
Environment = var.environment
Project = var.project_name
Owner = var.team_name
CostCenter = var.cost_center
ManagedBy = "terraform"
    # Note: timestamp() changes on every plan, so this tag shows a diff on each apply;
    # pass in a static value if you need a stable creation date.
    CreatedDate = formatdate("YYYY-MM-DD", timestamp())
}
# Optional tags that can be merged
optional_tags = {
Application = var.application_name
Component = var.component_name
Version = var.application_version
}
# Combined tags
common_tags = merge(local.required_tags, local.optional_tags)
}
# Provider-level default tags
provider "aws" {
region = var.aws_region
default_tags {
tags = local.required_tags
}
}
# Resource-specific tagging
resource "aws_instance" "web" {
ami = data.aws_ami.amazon_linux.id
instance_type = var.instance_type
tags = merge(local.common_tags, {
    Name         = "${var.name_prefix}-web"
Role = "webserver"
Backup = "daily"
AutoShutdown = var.environment != "production" ? "true" : "false"
})
}
# Enforce required tags with a postcondition (added to the same aws_instance.web resource)
resource "aws_instance" "web" {
# ... other configuration ...
lifecycle {
postcondition {
condition = alltrue([
for tag in keys(local.required_tags) :
      contains(keys(self.tags_all), tag) # tags_all also includes provider default_tags
])
error_message = "All required tags must be present: ${join(", ", keys(local.required_tags))}"
}
}
}
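Provider default_tags and postconditions only cover resources that Terraform itself manages. For everything else, a small audit script against the Resource Groups Tagging API can report resources that are missing a required tag. A minimal sketch in Python using boto3 (the required-tag keys mirror local.required_tags above):
import boto3

# Keys must match local.required_tags in the Terraform configuration above
REQUIRED_TAGS = {"Environment", "Project", "Owner", "CostCenter", "ManagedBy", "CreatedDate"}

def find_untagged_resources():
    """Return a map of resource ARN -> list of missing required tag keys."""
    client = boto3.client("resourcegroupstaggingapi")
    missing = {}
    for page in client.get_paginator("get_resources").paginate():
        for resource in page["ResourceTagMappingList"]:
            present = {tag["Key"] for tag in resource.get("Tags", [])}
            gaps = REQUIRED_TAGS - present
            if gaps:
                missing[resource["ResourceARN"]] = sorted(gaps)
    return missing

if __name__ == "__main__":
    for arn, gaps in find_untagged_resources().items():
        print(f"{arn}: missing {', '.join(gaps)}")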
Resource Right-Sizing
Implement policies to prevent oversized resources:
# Instance type validation
variable "allowed_instance_types" {
description = "Allowed EC2 instance types by environment"
type = map(list(string))
default = {
dev = [
"t3.nano", "t3.micro", "t3.small", "t3.medium"
]
staging = [
"t3.small", "t3.medium", "t3.large",
"m5.large", "m5.xlarge"
]
production = [
"t3.medium", "t3.large", "t3.xlarge",
"m5.large", "m5.xlarge", "m5.2xlarge",
"c5.large", "c5.xlarge", "c5.2xlarge"
]
}
}
resource "aws_instance" "web" {
ami = data.aws_ami.amazon_linux.id
instance_type = var.instance_type
lifecycle {
precondition {
condition = contains(
var.allowed_instance_types[var.environment],
var.instance_type
)
error_message = "Instance type ${var.instance_type} is not allowed in ${var.environment} environment. Allowed types: ${join(", ", var.allowed_instance_types[var.environment])}"
}
}
}
# RDS instance size controls
variable "allowed_db_instance_classes" {
description = "Allowed RDS instance classes by environment"
type = map(list(string))
default = {
dev = [
"db.t3.micro", "db.t3.small"
]
staging = [
"db.t3.small", "db.t3.medium", "db.r5.large"
]
production = [
"db.t3.medium", "db.t3.large",
"db.r5.large", "db.r5.xlarge", "db.r5.2xlarge"
]
}
}
resource "aws_db_instance" "main" {
identifier = "${var.name_prefix}-database"
engine = "mysql"
engine_version = "8.0"
instance_class = var.db_instance_class
lifecycle {
precondition {
condition = contains(
var.allowed_db_instance_classes[var.environment],
var.db_instance_class
)
error_message = "DB instance class ${var.db_instance_class} is not allowed in ${var.environment} environment."
}
}
}
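Allow-lists keep oversized instances from being created, but they do not catch instances that are already larger than their workload needs. A quick way to spot candidates is to scan CloudWatch CPU utilization; the sketch below flags running instances whose daily average CPU never exceeded 10% over the past two weeks (the threshold and window are illustrative, and AWS Compute Optimizer gives more thorough recommendations):
from datetime import datetime, timedelta, timezone
import boto3

ec2 = boto3.client("ec2")
cloudwatch = boto3.client("cloudwatch")

def underutilized_instances(cpu_threshold=10.0, days=14):
    """Flag running instances whose daily average CPU never exceeded the threshold."""
    end = datetime.now(timezone.utc)
    start = end - timedelta(days=days)
    candidates = []
    for page in ec2.get_paginator("describe_instances").paginate(
        Filters=[{"Name": "instance-state-name", "Values": ["running"]}]
    ):
        for reservation in page["Reservations"]:
            for instance in reservation["Instances"]:
                stats = cloudwatch.get_metric_statistics(
                    Namespace="AWS/EC2",
                    MetricName="CPUUtilization",
                    Dimensions=[{"Name": "InstanceId", "Value": instance["InstanceId"]}],
                    StartTime=start,
                    EndTime=end,
                    Period=86400,  # one datapoint per day
                    Statistics=["Average"],
                )
                points = stats["Datapoints"]
                if points and max(p["Average"] for p in points) < cpu_threshold:
                    candidates.append((instance["InstanceId"], instance["InstanceType"]))
    return candidates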
Automated Resource Scheduling
Implement automated start/stop for non-production resources:
# Lambda function for EC2 scheduling
resource "aws_lambda_function" "ec2_scheduler" {
filename = "ec2_scheduler.zip"
function_name = "${var.name_prefix}-ec2-scheduler"
role = aws_iam_role.ec2_scheduler.arn
handler = "index.handler"
  runtime       = "python3.12"
timeout = 60
environment {
variables = {
ENVIRONMENT = var.environment
}
}
tags = local.common_tags
}
# IAM role for scheduler
resource "aws_iam_role" "ec2_scheduler" {
name = "${var.name_prefix}-ec2-scheduler-role"
assume_role_policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Action = "sts:AssumeRole"
Effect = "Allow"
Principal = {
Service = "lambda.amazonaws.com"
}
}
]
})
}
resource "aws_iam_role_policy" "ec2_scheduler" {
name = "${var.name_prefix}-ec2-scheduler-policy"
role = aws_iam_role.ec2_scheduler.id
policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Effect = "Allow"
Action = [
"logs:CreateLogGroup",
"logs:CreateLogStream",
"logs:PutLogEvents"
]
Resource = "arn:aws:logs:*:*:*"
},
{
Effect = "Allow"
Action = [
"ec2:DescribeInstances",
"ec2:StartInstances",
"ec2:StopInstances"
]
Resource = "*"
}
]
})
}
# CloudWatch Events for scheduling
resource "aws_cloudwatch_event_rule" "stop_instances" {
name = "${var.name_prefix}-stop-instances"
  description         = "Stop non-production instances at 6 PM (UTC)"
schedule_expression = "cron(0 18 ? * MON-FRI *)"
tags = local.common_tags
}
resource "aws_cloudwatch_event_rule" "start_instances" {
name = "${var.name_prefix}-start-instances"
  description         = "Start non-production instances at 8 AM (UTC)"
schedule_expression = "cron(0 8 ? * MON-FRI *)"
tags = local.common_tags
}
resource "aws_cloudwatch_event_target" "stop_instances" {
rule = aws_cloudwatch_event_rule.stop_instances.name
target_id = "StopInstancesTarget"
arn = aws_lambda_function.ec2_scheduler.arn
input = jsonencode({
action = "stop"
})
}
resource "aws_cloudwatch_event_target" "start_instances" {
rule = aws_cloudwatch_event_rule.start_instances.name
target_id = "StartInstancesTarget"
arn = aws_lambda_function.ec2_scheduler.arn
input = jsonencode({
action = "start"
})
}
resource "aws_lambda_permission" "allow_cloudwatch_stop" {
statement_id = "AllowExecutionFromCloudWatchStop"
action = "lambda:InvokeFunction"
function_name = aws_lambda_function.ec2_scheduler.function_name
principal = "events.amazonaws.com"
source_arn = aws_cloudwatch_event_rule.stop_instances.arn
}
resource "aws_lambda_permission" "allow_cloudwatch_start" {
statement_id = "AllowExecutionFromCloudWatchStart"
action = "lambda:InvokeFunction"
function_name = aws_lambda_function.ec2_scheduler.function_name
principal = "events.amazonaws.com"
source_arn = aws_cloudwatch_event_rule.start_instances.arn
}
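The scheduler function itself is packaged as ec2_scheduler.zip and is not shown above. A minimal sketch of what its index.py handler might look like, assuming instances opt in through the AutoShutdown and Environment tags from the tagging section (the IAM policy above already grants the Describe/Start/Stop permissions it needs):
import os
import boto3

ec2 = boto3.client("ec2")

def handler(event, context):
    """Start or stop instances tagged AutoShutdown=true in this environment."""
    action = event.get("action", "stop")
    state = "running" if action == "stop" else "stopped"
    filters = [
        {"Name": "tag:AutoShutdown", "Values": ["true"]},
        {"Name": "tag:Environment", "Values": [os.environ["ENVIRONMENT"]]},
        {"Name": "instance-state-name", "Values": [state]},
    ]
    instance_ids = []
    for page in ec2.get_paginator("describe_instances").paginate(Filters=filters):
        for reservation in page["Reservations"]:
            instance_ids.extend(i["InstanceId"] for i in reservation["Instances"])
    if instance_ids:
        if action == "stop":
            ec2.stop_instances(InstanceIds=instance_ids)
        else:
            ec2.start_instances(InstanceIds=instance_ids)
    return {"action": action, "instances": instance_ids}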
Storage Lifecycle Management
Implement intelligent tiering and lifecycle policies:
# S3 bucket with intelligent tiering
resource "aws_s3_bucket" "data_storage" {
bucket = "${var.name_prefix}-data-storage"
tags = local.common_tags
}
resource "aws_s3_bucket_intelligent_tiering_configuration" "data_storage" {
bucket = aws_s3_bucket.data_storage.id
name = "EntireBucket"
status = "Enabled"
filter {
prefix = ""
}
tiering {
access_tier = "DEEP_ARCHIVE_ACCESS"
days = 180
}
tiering {
access_tier = "ARCHIVE_ACCESS"
days = 125
}
}
# Lifecycle configuration for different data types
resource "aws_s3_bucket_lifecycle_configuration" "data_storage" {
bucket = aws_s3_bucket.data_storage.id
rule {
id = "logs_lifecycle"
status = "Enabled"
filter {
prefix = "logs/"
}
transition {
days = 30
storage_class = "STANDARD_IA"
}
transition {
days = 90
storage_class = "GLACIER"
}
transition {
days = 365
storage_class = "DEEP_ARCHIVE"
}
expiration {
days = 2555 # 7 years
}
}
rule {
id = "temp_data_cleanup"
status = "Enabled"
filter {
prefix = "temp/"
}
expiration {
days = 7
}
}
rule {
id = "incomplete_multipart_uploads"
    status = "Enabled"
    filter {} # empty filter: applies to all objects in the bucket
abort_incomplete_multipart_upload {
days_after_initiation = 1
}
}
}
# EBS volume optimization
resource "aws_ebs_volume" "data" {
availability_zone = var.availability_zone
size = var.volume_size
type = var.environment == "production" ? "gp3" : "gp2"
encrypted = true
  # gp3 decouples throughput and IOPS from volume size; set them explicitly in production
  throughput = var.environment == "production" ? 125 : null  # gp3 baseline throughput (MiB/s)
  iops       = var.environment == "production" ? 3000 : null # gp3 baseline IOPS
tags = merge(local.common_tags, {
Name = "${var.name_prefix}-data-volume"
Type = "data"
})
}
Reserved Instance and Savings Plans Management
Track and manage reserved capacity:
# Look up active Reserved Instances (illustrative; if your provider version does not
# expose an RI data source, the same inventory can be pulled from the EC2 API,
# as sketched after this section)
data "aws_ec2_reserved_instances" "existing" {
filter {
name = "state"
values = ["active"]
}
}
# Local calculation for RI coverage
locals {
  # Desired running instances by type (supplied as an input variable)
  running_instances = var.instance_counts
# Calculate RI coverage
ri_coverage = {
for ri in data.aws_ec2_reserved_instances.existing.reserved_instances :
ri.instance_type => ri.instance_count
}
# Identify gaps in RI coverage
ri_gaps = {
for instance_type, running_count in local.running_instances :
instance_type => max(0, running_count - lookup(local.ri_coverage, instance_type, 0))
}
}
# Output RI recommendations
output "ri_recommendations" {
description = "Reserved Instance purchase recommendations"
value = {
for instance_type, gap in local.ri_gaps :
instance_type => {
running_instances = local.running_instances[instance_type]
reserved_instances = lookup(local.ri_coverage, instance_type, 0)
recommended_purchase = gap
}
if gap > 0
}
}
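The same coverage picture can be pulled directly from the EC2 API, which is handy if your provider version does not offer a Reserved Instance data source, or if you want to feed the numbers into a Savings Plans analysis in Cost Explorer. A sketch using boto3:
from collections import Counter
import boto3

ec2 = boto3.client("ec2")

def ri_gaps():
    """Compare running instances against active Reserved Instances, by instance type."""
    running = Counter()
    for page in ec2.get_paginator("describe_instances").paginate(
        Filters=[{"Name": "instance-state-name", "Values": ["running"]}]
    ):
        for reservation in page["Reservations"]:
            for instance in reservation["Instances"]:
                running[instance["InstanceType"]] += 1
    reserved = Counter()
    response = ec2.describe_reserved_instances(
        Filters=[{"Name": "state", "Values": ["active"]}]
    )
    for ri in response["ReservedInstances"]:
        reserved[ri["InstanceType"]] += ri["InstanceCount"]
    # Positive numbers are instances running without reserved coverage
    return {
        itype: count - reserved.get(itype, 0)
        for itype, count in running.items()
        if count > reserved.get(itype, 0)
    }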
Cost Monitoring and Alerting
Set up comprehensive cost monitoring:
# Budget for overall account spending
resource "aws_budgets_budget" "monthly_budget" {
name = "${var.name_prefix}-monthly-budget"
budget_type = "COST"
limit_amount = var.monthly_budget_limit
limit_unit = "USD"
time_unit = "MONTHLY"
  cost_filter {
    name   = "LinkedAccount"
    values = [data.aws_caller_identity.current.account_id]
  }
notification {
comparison_operator = "GREATER_THAN"
threshold = 80
threshold_type = "PERCENTAGE"
notification_type = "ACTUAL"
subscriber_email_addresses = var.budget_notification_emails
}
notification {
comparison_operator = "GREATER_THAN"
threshold = 100
threshold_type = "PERCENTAGE"
notification_type = "FORECASTED"
subscriber_email_addresses = var.budget_notification_emails
}
}
# Service-specific budgets
resource "aws_budgets_budget" "service_budgets" {
for_each = var.service_budgets
name = "${var.name_prefix}-${each.key}-budget"
budget_type = "COST"
limit_amount = each.value.limit
limit_unit = "USD"
time_unit = "MONTHLY"
  cost_filter {
    name   = "Service"
    values = [each.value.service_name]
  }
notification {
comparison_operator = "GREATER_THAN"
threshold = each.value.threshold
threshold_type = "PERCENTAGE"
notification_type = "ACTUAL"
subscriber_email_addresses = var.budget_notification_emails
}
}
# Cost anomaly detection
resource "aws_ce_anomaly_detector" "service_anomaly" {
name = "${var.name_prefix}-service-anomaly-detector"
monitor_type = "DIMENSIONAL"
specification = jsonencode({
Dimension = "SERVICE"
MatchOptions = ["EQUALS"]
Values = ["Amazon Elastic Compute Cloud - Compute", "Amazon Relational Database Service"]
})
tags = local.common_tags
}
resource "aws_ce_anomaly_subscription" "service_anomaly" {
name = "${var.name_prefix}-anomaly-subscription"
frequency = "DAILY"
monitor_arn_list = [
    aws_ce_anomaly_monitor.service_anomaly.arn
]
subscriber {
type = "EMAIL"
address = var.cost_anomaly_email
}
  threshold_expression {
    dimension {
      key           = "ANOMALY_TOTAL_IMPACT_ABSOLUTE"
      values        = ["50"]
      match_options = ["GREATER_THAN_OR_EQUAL"]
    }
  }
tags = local.common_tags
}
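Budgets and anomaly monitors alert you after the fact; for ad-hoc reporting, the Cost Explorer API can break spend down by service. A small sketch that sums unblended cost per service over the last week (Cost Explorer API calls carry a small per-request charge):
from datetime import date, timedelta
import boto3

ce = boto3.client("ce")

def daily_cost_by_service(days=7):
    """Sum unblended cost per service over the last N days."""
    end = date.today()
    start = end - timedelta(days=days)
    response = ce.get_cost_and_usage(
        TimePeriod={"Start": start.isoformat(), "End": end.isoformat()},
        Granularity="DAILY",
        Metrics=["UnblendedCost"],
        GroupBy=[{"Type": "DIMENSION", "Key": "SERVICE"}],
    )
    costs = {}
    for day in response["ResultsByTime"]:
        for group in day["Groups"]:
            service = group["Keys"][0]
            costs[service] = costs.get(service, 0.0) + float(
                group["Metrics"]["UnblendedCost"]["Amount"]
            )
    return costs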
Spot Instance Integration
Use Spot instances for cost-effective compute:
# Launch template for Spot instances
resource "aws_launch_template" "spot_template" {
name_prefix = "${var.name_prefix}-spot-"
image_id = data.aws_ami.amazon_linux.id
instance_type = var.spot_instance_type
vpc_security_group_ids = [aws_security_group.web.id]
iam_instance_profile {
name = aws_iam_instance_profile.app_server.name
}
user_data = base64encode(templatefile("${path.module}/user_data.sh", {
environment = var.environment
}))
tag_specifications {
resource_type = "instance"
tags = merge(local.common_tags, {
Name = "${var.name_prefix}-spot-instance"
Type = "spot"
})
}
}
# Auto Scaling Group with mixed instances
resource "aws_autoscaling_group" "mixed_instances" {
name = "${var.name_prefix}-mixed-asg"
vpc_zone_identifier = var.private_subnet_ids
target_group_arns = [aws_lb_target_group.web.arn]
health_check_type = "ELB"
min_size = var.min_size
max_size = var.max_size
desired_capacity = var.desired_capacity
mixed_instances_policy {
launch_template {
launch_template_specification {
launch_template_id = aws_launch_template.spot_template.id
version = "$Latest"
}
override {
instance_type = "t3.medium"
weighted_capacity = "1"
}
override {
instance_type = "t3.large"
weighted_capacity = "2"
}
}
instances_distribution {
on_demand_base_capacity = 1
on_demand_percentage_above_base_capacity = 25
    # spot_instance_pools only applies to the "lowest-price" strategy, so it is
    # omitted here with capacity-optimized
    spot_allocation_strategy = "capacity-optimized"
spot_max_price = var.spot_max_price
}
}
tag {
key = "Name"
value = "${var.name_prefix}-mixed-asg"
propagate_at_launch = true
}
dynamic "tag" {
for_each = local.common_tags
content {
key = tag.key
value = tag.value
propagate_at_launch = true
}
}
}
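Before settling on the override instance types, it is worth sanity-checking current Spot pricing in your region. A sketch that pulls the latest Linux Spot price per instance type and Availability Zone (the instance types mirror the overrides above):
from datetime import datetime, timezone
import boto3

ec2 = boto3.client("ec2")

def current_spot_prices(instance_types=("t3.medium", "t3.large")):
    """Return the latest Linux Spot price per (instance type, Availability Zone)."""
    response = ec2.describe_spot_price_history(
        InstanceTypes=list(instance_types),
        ProductDescriptions=["Linux/UNIX"],
        StartTime=datetime.now(timezone.utc),  # a current StartTime returns only the latest prices
    )
    return {
        (entry["InstanceType"], entry["AvailabilityZone"]): float(entry["SpotPrice"])
        for entry in response["SpotPriceHistory"]
    }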
Resource Cleanup Automation
Automate cleanup of unused resources:
# Lambda function for resource cleanup
resource "aws_lambda_function" "resource_cleanup" {
filename = "resource_cleanup.zip"
function_name = "${var.name_prefix}-resource-cleanup"
role = aws_iam_role.resource_cleanup.arn
handler = "index.handler"
  runtime       = "python3.12"
timeout = 300
environment {
variables = {
ENVIRONMENT = var.environment
      DRY_RUN     = tostring(var.cleanup_dry_run) # Lambda environment values must be strings
}
}
tags = local.common_tags
}
# Schedule cleanup to run weekly
resource "aws_cloudwatch_event_rule" "resource_cleanup" {
name = "${var.name_prefix}-resource-cleanup"
description = "Weekly resource cleanup"
schedule_expression = "cron(0 2 ? * SUN *)" # 2 AM every Sunday
tags = local.common_tags
}
resource "aws_cloudwatch_event_target" "resource_cleanup" {
rule = aws_cloudwatch_event_rule.resource_cleanup.name
target_id = "ResourceCleanupTarget"
arn = aws_lambda_function.resource_cleanup.arn
}
resource "aws_lambda_permission" "allow_cloudwatch_cleanup" {
statement_id = "AllowExecutionFromCloudWatch"
action = "lambda:InvokeFunction"
function_name = aws_lambda_function.resource_cleanup.function_name
principal = "events.amazonaws.com"
source_arn = aws_cloudwatch_event_rule.resource_cleanup.arn
}
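As with the scheduler, the cleanup code ships in resource_cleanup.zip and is not shown above. A minimal sketch of a handler that removes unattached EBS volumes in the current environment, honoring the DRY_RUN flag (the function's IAM role, not shown in this part, would need ec2:DescribeVolumes and ec2:DeleteVolume):
import os
import boto3

ec2 = boto3.client("ec2")

def handler(event, context):
    """Delete unattached EBS volumes in this environment, honoring DRY_RUN."""
    dry_run = os.environ.get("DRY_RUN", "true").lower() == "true"
    filters = [
        {"Name": "status", "Values": ["available"]},  # "available" means not attached
        {"Name": "tag:Environment", "Values": [os.environ["ENVIRONMENT"]]},
    ]
    candidates = []
    for page in ec2.get_paginator("describe_volumes").paginate(Filters=filters):
        for volume in page["Volumes"]:
            candidates.append(volume["VolumeId"])
            if not dry_run:
                ec2.delete_volume(VolumeId=volume["VolumeId"])
    return {"dry_run": dry_run, "unattached_volumes": candidates}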
What’s Next
Cost optimization provides the financial discipline needed for sustainable AWS operations, but implementing reusable patterns and modules is what makes these optimizations scalable across your organization.
In the next part, we’ll explore AWS-specific module patterns that encapsulate these cost optimization strategies along with security and operational best practices, creating reusable building blocks for your infrastructure.