The Situation
Monitoring automation — new EC2 instances should be observable from day one without manual alarm setup in the CloudWatch console.
Problem Statement
Your team gets paged at 3 AM when an EC2 instance runs out of disk space — but only after the application has already crashed. Without proactive alarms, you’re reactive. This script creates CPU, memory, disk, and status-check alarms for every running instance in under 30 seconds.
Two Metric Namespaces
| Namespace | Source | Metrics |
|---|---|---|
| AWS/EC2 | Built-in (no agent needed) | CPUUtilization, StatusCheckFailed, NetworkIn/Out |
| CWAgent | Requires CloudWatch Agent on instance | mem_used_percent, disk_used_percent |
Memory and disk are not natively reported by AWS — you must install the CloudWatch Agent on each instance.
Complete Script
import boto3
import logging
# Configure root logging once at import: INFO and above, each record prefixed
# with a timestamp and the level name.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
# Module-scoped logger used by the functions in this script.
logger = logging.getLogger(__name__)
def _instance_label(instance: dict) -> str:
    """Return the instance's non-empty Name tag, falling back to its InstanceId."""
    for tag in instance.get("Tags", []):
        if tag["Key"] == "Name" and tag["Value"]:
            return tag["Value"]
    return instance["InstanceId"]


def _build_alarm_configs(
    name: str,
    iid: str,
    disk_path: str,
    disk_fstype: str,
    disk_device: str,
) -> list:
    """Return the four alarm definitions (CPU, status check, memory, disk) for one instance.

    Each dict maps to one put_metric_alarm() call. `name` is used as the
    alarm-name prefix and in the description; `iid` is the InstanceId
    dimension value.
    """
    return [
        # ── 1. CPU Utilization ─────────────────────────────────────
        # Period=300 (5 min) × EvaluationPeriods=3 = 15-min sustained breach,
        # so brief CPU spikes do not raise the alarm.
        {
            "name": f"{name}-HighCPU",
            "metric": "CPUUtilization",
            "namespace": "AWS/EC2",
            "threshold": 80.0,
            "comparison": "GreaterThanThreshold",
            "period": 300,
            "eval_periods": 3,
            "description": f"CPU > 80% for 15 min on {name}",
            "dimensions": [{"Name": "InstanceId", "Value": iid}],
        },
        # ── 2. Status Check ────────────────────────────────────────
        # Period=60 (1 min) × EvaluationPeriods=2 = 2-min sustained failure.
        # Threshold 0 with GreaterThanThreshold: any non-zero value breaches.
        {
            "name": f"{name}-StatusCheckFailed",
            "metric": "StatusCheckFailed",
            "namespace": "AWS/EC2",
            "threshold": 0,
            "comparison": "GreaterThanThreshold",
            "period": 60,
            "eval_periods": 2,
            "description": f"Status check failed on {name}",
            "dimensions": [{"Name": "InstanceId", "Value": iid}],
        },
        # ── 3. Memory (requires CloudWatch Agent) ──────────────────
        # mem_used_percent lives in the "CWAgent" namespace, not "AWS/EC2".
        # NOTE(review): the alarm only matches if the agent publishes this
        # metric with an InstanceId dimension — confirm the agent config.
        {
            "name": f"{name}-HighMemory",
            "metric": "mem_used_percent",
            "namespace": "CWAgent",
            "threshold": 85.0,
            "comparison": "GreaterThanThreshold",
            "period": 300,
            "eval_periods": 3,
            "description": f"Memory > 85% on {name}",
            "dimensions": [{"Name": "InstanceId", "Value": iid}],
        },
        # ── 4. Disk Space (requires CloudWatch Agent) ──────────────
        # disk_used_percent needs extra dimensions to identify WHICH disk.
        # path / fstype / device must match exactly what the agent reports,
        # which is why they are caller-configurable.
        {
            "name": f"{name}-LowDiskSpace",
            "metric": "disk_used_percent",
            "namespace": "CWAgent",
            "threshold": 85.0,
            "comparison": "GreaterThanThreshold",
            "period": 300,
            "eval_periods": 2,
            "description": f"Disk > 85% on {name} {disk_path}",
            "dimensions": [
                {"Name": "InstanceId", "Value": iid},
                {"Name": "path", "Value": disk_path},
                {"Name": "fstype", "Value": disk_fstype},
                {"Name": "device", "Value": disk_device},
            ],
        },
    ]


def setup_ec2_alarms(
    sns_topic_arn: str,
    region: str = "us-east-1",
    disk_path: str = "/",
    disk_fstype: str = "xfs",
    disk_device: str = "nvme0n1p1",
) -> int:
    """
    Create 4 alarms per running EC2 instance in `region`:
    1. High CPU (AWS/EC2 namespace — no agent needed)
    2. Status Check Failed (AWS/EC2 — built-in)
    3. High Memory (CWAgent — requires CloudWatch Agent)
    4. Low Disk Space (CWAgent — requires CloudWatch Agent)

    Alarm names are prefixed with the instance's Name tag. When the tag is
    missing or empty the InstanceId is used instead, and when several running
    instances share the same Name tag the InstanceId is appended so that their
    alarms do not overwrite each other.

    Args:
        sns_topic_arn: ARN passed as both AlarmActions and OKActions.
        region: Region used for the EC2 and CloudWatch clients.
        disk_path: Value of the "path" dimension for the disk alarm.
        disk_fstype: Value of the "fstype" dimension (e.g. "xfs" or "ext4").
        disk_device: Value of the "device" dimension (e.g. "nvme0n1p1").

    Returns:
        The number of put_metric_alarm() calls that succeeded. A failure on
        one alarm is logged and does not stop the remaining alarms.
    """
    ec2 = boto3.client("ec2", region_name=region)
    cw = boto3.client("cloudwatch", region_name=region)

    # ── Discover all running instances (paginated) ─────────────────
    instances = []
    paginator = ec2.get_paginator("describe_instances")
    for page in paginator.paginate(
        Filters=[{"Name": "instance-state-name", "Values": ["running"]}]
    ):
        for reservation in page["Reservations"]:
            instances.extend(reservation["Instances"])

    logger.info("Creating alarms for %d running instances...", len(instances))

    # Count how many instances share each label so duplicates can be
    # disambiguated; a repeated alarm name would replace the earlier alarm.
    label_counts = {}
    for instance in instances:
        label = _instance_label(instance)
        label_counts[label] = label_counts.get(label, 0) + 1

    alarms_created = 0
    for instance in instances:
        iid = instance["InstanceId"]
        name = _instance_label(instance)
        if label_counts[name] > 1:
            name = f"{name}-{iid}"

        alarm_configs = _build_alarm_configs(
            name, iid, disk_path, disk_fstype, disk_device
        )

        # ── Create each alarm ──────────────────────────────────────
        for cfg in alarm_configs:
            try:
                cw.put_metric_alarm(
                    AlarmName=cfg["name"],
                    AlarmDescription=cfg["description"],
                    MetricName=cfg["metric"],
                    Namespace=cfg["namespace"],
                    Statistic="Average",
                    # Dimensions identify the resource to monitor.
                    Dimensions=cfg["dimensions"],
                    # Period: seconds per data point (60, 300, 900...).
                    Period=cfg["period"],
                    # EvaluationPeriods: how many consecutive data points
                    # must breach the threshold before the alarm fires.
                    EvaluationPeriods=cfg["eval_periods"],
                    Threshold=cfg["threshold"],
                    ComparisonOperator=cfg["comparison"],
                    # TreatMissingData:
                    #   "notBreaching" — missing data doesn't trigger alarm
                    #   "breaching"    — missing data triggers alarm
                    #   "ignore"       — keeps previous alarm state
                    TreatMissingData="notBreaching",
                    # Notify the topic on ALARM and again on recovery to OK.
                    AlarmActions=[sns_topic_arn],
                    OKActions=[sns_topic_arn],
                )
            except Exception as e:
                # Deliberate best-effort: one failed alarm must not prevent
                # the remaining alarms/instances from being processed.
                logger.error(" Failed to create alarm %s: %s", cfg["name"], e)
            else:
                alarms_created += 1
                logger.info(" Created: %s", cfg["name"])

    logger.info(
        "Done. Created %d alarms for %d instances.", alarms_created, len(instances)
    )
    return alarms_created
def delete_alarms_for_terminated_instances(region: str = "us-east-1") -> int:
    """
    Delete metric alarms whose InstanceId dimension no longer refers to a
    live instance in `region`.

    An instance counts as live when describe_instances() returns it in any
    state other than "terminated". Recently terminated instances can still be
    listed by describe_instances(), so they are excluded explicitly; otherwise
    their alarms would survive the cleanup.

    Every metric alarm carrying an InstanceId dimension is considered, not
    only alarms created by setup_ec2_alarms(). Alarms without an InstanceId
    dimension are never touched.

    Args:
        region: Region used for the EC2 and CloudWatch clients.

    Returns:
        The number of alarm names submitted for deletion.
    """
    ec2 = boto3.client("ec2", region_name=region)
    cw = boto3.client("cloudwatch", region_name=region)

    # Collect the IDs of all instances that still exist (not terminated).
    existing_ids = set()
    paginator = ec2.get_paginator("describe_instances")
    for page in paginator.paginate():
        for reservation in page["Reservations"]:
            for inst in reservation["Instances"]:
                # A missing State is treated as live so that incomplete data
                # never causes an alarm to be deleted.
                if inst.get("State", {}).get("Name") != "terminated":
                    existing_ids.add(inst["InstanceId"])

    # Walk all metric alarms and pick those pointing at unknown instances.
    alarms_to_delete = []
    cw_paginator = cw.get_paginator("describe_alarms")
    for page in cw_paginator.paginate(AlarmTypes=["MetricAlarm"]):
        for alarm in page["MetricAlarms"]:
            instance_id = next(
                (
                    d["Value"]
                    for d in alarm.get("Dimensions", [])
                    if d["Name"] == "InstanceId"
                ),
                None,
            )
            if instance_id and instance_id not in existing_ids:
                alarms_to_delete.append(alarm["AlarmName"])

    if alarms_to_delete:
        # delete_alarms() accepts up to 100 names at a time
        for start in range(0, len(alarms_to_delete), 100):
            cw.delete_alarms(AlarmNames=alarms_to_delete[start:start + 100])
        logger.info("Deleted %d stale alarm(s)", len(alarms_to_delete))

    return len(alarms_to_delete)
if __name__ == "__main__":
    # The SNS topic used for alarm actions has to be in the same region as
    # the alarms themselves, so the ARN is built from the target region
    # instead of hard-coding a different one.
    target_region = "ap-south-1"
    total = setup_ec2_alarms(
        sns_topic_arn=f"arn:aws:sns:{target_region}:123456789012:ops-alerts",
        region=target_region,
    )
    print(f"\nTotal alarms created: {total}")
    # Run cleanup weekly to remove alarms for terminated instances
    # deleted = delete_alarms_for_terminated_instances(region=target_region)
Key Commands Explained
| Command | What it does |
|---|---|
| `cw.put_metric_alarm(...)` | Creates or updates (idempotent) a CloudWatch alarm |
| `Namespace="AWS/EC2"` | Built-in EC2 metrics — no agent required |
| `Namespace="CWAgent"` | Custom metrics from the CloudWatch Agent |
| `Period=300` | 5-minute data point interval — standard for EC2 metrics |
| `EvaluationPeriods=3` | Alarm fires only after 3 consecutive breaches (avoids false alarms) |
| `TreatMissingData="notBreaching"` | Missing data points don’t trigger the alarm |
| `AlarmActions=[sns_topic_arn]` | SNS topic to notify when alarm state changes to ALARM |
| `OKActions=[sns_topic_arn]` | SNS topic to notify when alarm recovers to OK |
| `delete_alarms(AlarmNames=[...])` | Deletes up to 100 alarms per call |
CloudWatch Agent Installation (Quick Reference)
# On Amazon Linux 2 / 2023
sudo yum install -y amazon-cloudwatch-agent

# Minimal config to publish mem + disk.
# append_dimensions adds the InstanceId dimension to every metric; the alarms
# created by the script look metrics up by InstanceId, so without it they
# would never match any data.
# The config directory is root-owned, hence `sudo tee` instead of `cat >`.
# The quoted 'EOF' keeps the shell from expanding ${aws:InstanceId}.
sudo tee /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json >/dev/null <<'EOF'
{
  "metrics": {
    "append_dimensions": {
      "InstanceId": "${aws:InstanceId}"
    },
    "metrics_collected": {
      "mem": { "measurement": ["mem_used_percent"] },
      "disk": { "measurement": ["disk_used_percent"], "resources": ["/"] }
    }
  }
}
EOF

# Load the config and (re)start the agent.
sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl \
  -a fetch-config -m ec2 \
  -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json -s
---
## 🔍 Line-by-Line Code Walkthrough
### Imports
| Line | Why It's Used |
|------|--------------|
| `import boto3` | AWS SDK — needed for EC2 and CloudWatch clients |
| `import logging` | Structured log output with timestamps |
---
### `setup_ec2_alarms(sns_topic_arn, region)`
```python
ec2 = boto3.client("ec2", region_name=region)
cw = boto3.client("cloudwatch", region_name=region)
```

| Line | Explanation |
|---|---|
| `boto3.client("ec2", region_name=region)` | EC2 client to list running instances |
| `boto3.client("cloudwatch", region_name=region)` | CloudWatch client to create alarms. Alarms are regional — they must be in the same region as the EC2 instances they monitor |
paginator = ec2.get_paginator("describe_instances")
for page in paginator.paginate(
Filters=[{"Name": "instance-state-name", "Values": ["running"]}]
):
for r in page["Reservations"]:
instances.extend(r["Instances"])
| Line | Explanation |
|---|---|
| `get_paginator("describe_instances")` | Handles pagination automatically. Without this, you’d miss instances if there are more than 1000 |
| `Filters=[{"Name":"instance-state-name","Values":["running"]}]` | Only returns running instances. No point creating alarms for stopped or terminated instances |
| `instances.extend(r["Instances"])` | Flattens Reservations → Instances into one list |
### Alarm Config Dictionary
{
"name": f"{name}-HighCPU",
"metric": "CPUUtilization",
"namespace": "AWS/EC2",
"threshold": 80.0,
"comparison": "GreaterThanThreshold",
"period": 300,
"eval_periods": 3,
"dimensions": [{"Name": "InstanceId", "Value": iid}],
}
| Field | Explanation |
|---|---|
| `"name"` | The CloudWatch alarm name. Must be unique per account+region. We prefix with the instance Name tag for readability |
| `"metric": "CPUUtilization"` | The CloudWatch metric name to monitor. Case-sensitive |
| `"namespace": "AWS/EC2"` | EC2 built-in metrics live in the AWS/EC2 namespace — no agent required |
| `"namespace": "CWAgent"` | Memory and disk metrics published by the CloudWatch Agent live in the CWAgent namespace |
| `"threshold": 80.0` | The breach level. For "GreaterThanThreshold", alarm fires when the metric exceeds 80 |
| `"comparison": "GreaterThanThreshold"` | The comparison operator. Other options: GreaterThanOrEqualToThreshold, LessThanThreshold, LessThanOrEqualToThreshold |
| `"period": 300` | Data point interval in seconds. 300 = 5-minute data points. CloudWatch stores metrics at 1-min or 5-min resolution depending on detailed monitoring |
| `"eval_periods": 3` | Number of consecutive data points that must breach the threshold before the alarm fires. 3 periods × 300 seconds = 15 minutes sustained breach before alert |
| `"dimensions"` | Identifies WHICH resource to monitor. InstanceId is the primary dimension for EC2 metrics |
"dimensions": [
{"Name": "InstanceId", "Value": iid},
{"Name": "path", "Value": "/"},
{"Name": "fstype", "Value": "xfs"},
{"Name": "device", "Value": "nvme0n1p1"},
],
| Field | Explanation |
|---|---|
| `"path": "/"` | The mount point of the disk to monitor. Must exactly match what the CWAgent reports (check CloudWatch Metrics → CWAgent for actual values) |
| `"fstype": "xfs"` | Filesystem type. Amazon Linux 2 uses xfs; Ubuntu uses ext4. Wrong value means the alarm never fires because no metrics match |
| `"device": "nvme0n1p1"` | The disk device name. NVMe instances use nvme0n1p1; older instances use xvda1. Must match CWAgent output exactly |
| Why so many dimensions? | CWAgent reports metrics per disk mount. Without these dimensions, CloudWatch can’t identify which disk’s disk_used_percent to monitor |
### `cw.put_metric_alarm(...)`
cw.put_metric_alarm(
AlarmName=cfg["name"],
AlarmDescription=cfg["description"],
MetricName=cfg["metric"],
Namespace=cfg["namespace"],
Statistic="Average",
Dimensions=cfg["dimensions"],
Period=cfg["period"],
EvaluationPeriods=cfg["eval_periods"],
Threshold=cfg["threshold"],
ComparisonOperator=cfg["comparison"],
TreatMissingData="notBreaching",
AlarmActions=[sns_topic_arn],
OKActions=[sns_topic_arn],
)
| Parameter | Explanation |
|---|---|
| `AlarmName` | Unique identifier. If an alarm with this name already exists, put_metric_alarm updates it — making it idempotent |
| `Statistic="Average"` | How to aggregate data points within each period. "Average" is standard for CPU and memory. "Sum" for counting events. "Maximum" for peak-sensitive metrics |
| `TreatMissingData="notBreaching"` | What to do if no data arrives in a period. "notBreaching" = keep OK state (instance may just not be reporting yet). Use "breaching" for uptime alarms where missing data = instance is down |
| `AlarmActions=[sns_topic_arn]` | List of ARNs to trigger when alarm state changes to ALARM. SNS topics are the most common target — they can fan out to email, PagerDuty, Slack, Lambda |
| `OKActions=[sns_topic_arn]` | Triggered when the alarm recovers from ALARM → OK. Sends a “resolved” notification to the same SNS topic |
### `delete_alarms_for_terminated_instances()`
cw_paginator = cw.get_paginator("describe_alarms")
for page in cw_paginator.paginate(AlarmTypes=["MetricAlarm"]):
for alarm in page["MetricAlarms"]:
instance_id = next(
(d["Value"] for d in alarm.get("Dimensions", []) if d["Name"] == "InstanceId"),
None,
)
if instance_id and instance_id not in existing_ids:
alarms_to_delete.append(alarm["AlarmName"])
| Line | Explanation |
|---|---|
| `AlarmTypes=["MetricAlarm"]` | Filters to metric alarms only (vs composite alarms). The parameter is optional — when it is omitted only metric alarms are returned — but passing it makes the intent explicit |
| `alarm.get("Dimensions", [])` | Gets the alarm’s dimension list. Some alarms (for example, alarms on non-EC2 metrics) may have no dimensions |
| `d["Name"] == "InstanceId"` | Searches for the InstanceId dimension to identify which EC2 instance this alarm belongs to |
| `instance_id not in existing_ids` | If the alarm’s InstanceId is not in the set of instance IDs collected from `describe_instances`, the alarm is considered stale and is queued for deletion |
for i in range(0, len(alarms_to_delete), 100):
cw.delete_alarms(AlarmNames=alarms_to_delete[i:i+100])
| Line | Explanation |
|---|---|
| `range(0, len(alarms_to_delete), 100)` | Generates indices [0, 100, 200, ...] — batch slicing loop |
| `alarms_to_delete[i:i+100]` | Slice of up to 100 alarm names. delete_alarms accepts a maximum of 100 alarm names per call |
| Why batch? | With many instances, you could have thousands of stale alarms. Batching keeps each call within the 100-name limit per API call |