AWS CloudWatch监控之钉钉告警

pivoteic
发布于 2023-7-12 17:32
浏览
0收藏

AWS CloudWatch监控之钉钉告警-鸿蒙开发者社区

一、CloudWatch服务安装

Amazon Linux 2系统安装Agent

AWS CloudWatch监控之钉钉告警-鸿蒙开发者社区

Bash
#!/bin/bash
# Install and configure the Amazon CloudWatch agent on Amazon Linux 2.

# Install the agent package. rpm needs root, so use sudo like the other steps.
sudo rpm -ivh https://s3.amazonaws.com/amazoncloudwatch-agent/amazon_linux/amd64/latest/amazon-cloudwatch-agent.rpm

# Write the agent configuration: two log files shipped to one log group, plus
# CPU / disk / disk I/O / memory / netstat / statsd / swap metrics.
# The file is overwritten rather than appended to (no `tee -a`): appending a
# second JSON document on a re-run would leave the file as invalid JSON.
sudo tee /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json <<-'EOF'
{
        "logs": {
                "logs_collected": {
                        "files": {
                                "collect_list": [
                                        {
                                                "file_path": "/logArchive/hcaextension/info*.log",
                                                "log_group_name": "RGC-Prod-3in1oven",
                                                "log_stream_name": "info.logs"
                                        },
                                        {
                                                "file_path": "/logArchive/hcaextension/http*.log",
                                                "log_group_name": "RGC-Prod-3in1oven",
                                                "log_stream_name": "http.logs"
                                        }
                                ]
                        }
                }
        },
        "metrics": {
                "aggregation_dimensions": [
                        [
                                "InstanceId"
                        ]
                ],
                "append_dimensions": {
                        "AutoScalingGroupName": "${aws:AutoScalingGroupName}",
                        "ImageId": "${aws:ImageId}",
                        "InstanceId": "${aws:InstanceId}",
                        "InstanceType": "${aws:InstanceType}"
                },
                "metrics_collected": {
                        "cpu": {
                                "measurement": [
                                        "cpu_usage_idle",
                                        "cpu_usage_iowait",
                                        "cpu_usage_user",
                                        "cpu_usage_system"
                                ],
                                "metrics_collection_interval": 180,
                                "resources": [
                                        "*"
                                ],
                                "totalcpu": false
                        },
                        "disk": {
                                "measurement": [
                                        "used_percent"
                                ],
                                "metrics_collection_interval": 180,
                                "resources": [
                                        "/"
                                ]
                        },
                        "diskio": {
                                "measurement": [
                                        "io_time",
                                        "write_bytes",
                                        "read_bytes",
                                        "writes",
                                        "reads"
                                ],
                                "metrics_collection_interval": 180,
                                "resources": [
                                        "/"
                                ]
                        },
                        "mem": {
                                "measurement": [
                                        "mem_used_percent"
                                ],
                                "metrics_collection_interval": 180
                        },
                        "netstat": {
                                "measurement": [
                                        "tcp_established",
                                        "tcp_time_wait"
                                ],
                                "metrics_collection_interval": 180
                        },
                        "statsd": {
                                "metrics_aggregation_interval": 60,
                                "metrics_collection_interval": 180,
                                "service_address": ":8125"
                        },
                        "swap": {
                                "measurement": [
                                        "swap_used_percent"
                                ],
                                "metrics_collection_interval": 180
                        }
                }
        }
}

EOF

# Load the configuration file in EC2 mode; -s (re)starts the agent afterwards.
sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json

# Restart the service and enable it at boot.
sudo systemctl restart amazon-cloudwatch-agent.service
sudo systemctl enable amazon-cloudwatch-agent.service

二、AWS-CLI批量下发监控

前提条件:本机安装awscli工具

需要修改的是区域信息(AWSREGION)、ip_list(要监控的EC2实例ID列表)以及sns_arn(SNS主题的ARN)

通过脚本自动在CloudWatch上添加监控配置EC2监控

Python
#!/usr/bin/python
# -*- coding: utf-8 -*-

import os
import json
import subprocess

# 1. Configure the AWS CLI invocation and the target regions.
Contants = {
    # Quoted path to the AWS CLI executable (a Windows install path here),
    # with JSON output forced. The script entry point appends
    # " --region <name>" to this value for every region it processes.
    "AWSCLI": '"C:\\Program Files\\Amazon\\AWSCLI\\bin\\aws.exe" --output json',
    # Regions to process. eu-central-1 is Europe (Frankfurt), not Singapore
    # (Singapore would be ap-southeast-1).
    "AWSREGION": ['eu-central-1']
}


# Auto-vivifying dictionary
class CreateDict(dict):
    """Dictionary that creates nested ``CreateDict`` values on first access.

    Looking up a missing key stores a new, empty instance of the same class
    under that key and returns it, so ``d['a']['b'] = 1`` works without
    creating ``d['a']`` first. Existing keys behave like a normal ``dict``.
    """

    def __getitem__(self, item):
        # Materialise the missing entry first, then fall through to the
        # ordinary dict lookup.
        if item not in self:
            self[item] = type(self)()
        return super().__getitem__(item)


#########################################################################################################
# 配置告警

# CPUUtilization: alarm when the 1-minute average is >= 80% for 3 checks in a row.
def getCPUUtilizationComm(name, action, instance_id):
    """Build the ``cloudwatch put-metric-alarm`` command for EC2 CPUUtilization.

    The command is only assembled and returned; it is not executed here.
    A progress banner is printed as a side effect.

    :param name: instance name, used in the alarm name ``AWS_EC2_<name>_CPUUtilization``.
    :param action: ARN used for both ``--alarm-actions`` and ``--ok-actions``.
    :param instance_id: value of the ``InstanceId`` dimension.
    :return: the complete command line as a single-line string.
    """
    mertic = 'CPUUtilization'
    print("#####开始配置 %s#####" % mertic)
    # One CLI option per entry; joined with single spaces into one command line.
    pieces = [
        '{cli} cloudwatch put-metric-alarm',
        '--alarm-name "AWS_EC2_{name}_{mertic}"',
        '--alarm-description "aws ec2 {mertic}"',
        '--metric-name {mertic}',
        '--namespace AWS/EC2',
        '--statistic Average',
        '--period 60',
        '--threshold 80',
        '--evaluation-periods 3',
        '--datapoints-to-alarm 3',
        '--comparison-operator GreaterThanOrEqualToThreshold',
        '--treat-missing-data notBreaching',
        '--alarm-actions "{action}"',
        '--ok-actions "{action}"',
        '--unit Percent',
        '--dimensions "Name=InstanceId,Value={id}"',
    ]
    fields = {
        'cli': Contants['AWSCLI'],
        'name': name,
        'action': action,
        'id': instance_id,
        'mertic': mertic,
    }
    return ' '.join(pieces).format(**fields)


# mem_used_percent: alarm when the 1-minute average is >= 80% for 3 checks in a row.
def getmem_used_percentComm(name, action, instance_id, instancetype, imageid):
    """Build the ``cloudwatch put-metric-alarm`` command for memory usage.

    The metric is ``mem_used_percent`` in the ``CWAgent`` namespace, selected
    by the ``InstanceId``, ``ImageId`` and ``InstanceType`` dimensions.
    The command is only assembled and returned; a progress banner is printed.

    :param name: instance name, used in the alarm name.
    :param action: ARN used for both ``--alarm-actions`` and ``--ok-actions``.
    :param instance_id: value of the ``InstanceId`` dimension.
    :param instancetype: value of the ``InstanceType`` dimension.
    :param imageid: value of the ``ImageId`` dimension.
    :return: the complete command line as a single-line string.
    """
    mertic = 'mem_used_percent'
    print("#####开始配置 %s#####" % mertic)
    dimension_spec = (
        'Name=InstanceId,Value={id} '
        'Name=ImageId,Value={imageid} '
        'Name=InstanceType,Value={instancetype}'
    )
    command_template = ' '.join((
        '{cli} cloudwatch put-metric-alarm',
        '--alarm-name "AWS_EC2_{name}_{mertic}"',
        '--alarm-description "aws ec2 {mertic}"',
        '--metric-name {mertic}',
        '--namespace CWAgent',
        '--statistic Average',
        '--period 60',
        '--threshold 80',
        '--evaluation-periods 3',
        '--datapoints-to-alarm 3',
        '--comparison-operator GreaterThanOrEqualToThreshold',
        '--treat-missing-data missing',
        '--alarm-actions "{action}"',
        '--ok-actions "{action}"',
        '--dimensions ' + dimension_spec,
    ))
    return command_template.format(
        cli=Contants['AWSCLI'],
        name=name,
        action=action,
        id=instance_id,
        mertic=mertic,
        instancetype=instancetype,
        imageid=imageid,
    )


# disk_used_percent: alarm when the 1-minute average is >= 80% for 3 checks in a row.
def getdisk_used_percentComm(name, action, instance_id, instancetype, imageid,
                             device='nvme0n1p1', fstype='ext4', path='/'):
    """Build the ``cloudwatch put-metric-alarm`` command for disk usage.

    The metric is ``disk_used_percent`` in the ``CWAgent`` namespace. The
    command is only assembled and returned; a progress banner is printed.

    ``--dimensions`` is passed exactly once. The original template passed it
    twice (a lone ``path`` dimension first, then the full set), which made the
    first occurrence dead and misleading.

    The ``device`` and ``fstype`` values cannot be derived from the instance
    description; look them up on the metric in the CloudWatch console and pass
    them in when they differ from the defaults.

    :param name: instance name, used in the alarm name.
    :param action: ARN used for both ``--alarm-actions`` and ``--ok-actions``.
    :param instance_id: value of the ``InstanceId`` dimension.
    :param instancetype: value of the ``InstanceType`` dimension.
    :param imageid: value of the ``ImageId`` dimension.
    :param device: value of the ``device`` dimension (default ``nvme0n1p1``).
    :param fstype: value of the ``fstype`` dimension (default ``ext4``).
    :param path: value of the ``path`` dimension (default ``/``).
    :return: the complete command line as a single-line string.
    """
    mertic = 'disk_used_percent'
    print("#####开始配置 %s#####" % mertic)
    return '''{cli} cloudwatch put-metric-alarm \
--alarm-name "AWS_EC2_{name}_{mertic}" \
--alarm-description "aws ec2 {mertic}" \
--metric-name {mertic} \
--namespace CWAgent \
--statistic Average \
--period 60 \
--threshold 80 \
--evaluation-periods 3 \
--datapoints-to-alarm 3 \
--comparison-operator GreaterThanOrEqualToThreshold \
--treat-missing-data missing \
--alarm-actions "{action}" \
--ok-actions "{action}" \
--dimensions Name=InstanceId,Value={id} Name=ImageId,Value={imageid} Name=InstanceType,Value={instancetype} Name=device,Value={device} Name=fstype,Value={fstype} "Name=path,Value={path}"'''.format(
        cli=Contants['AWSCLI'], name=name, action=action, id=instance_id, mertic=mertic,
        instancetype=instancetype, imageid=imageid, device=device, fstype=fstype, path=path)


# NetworkIn: alarm when the 1-minute average is >= 5,000,000 bytes for 3 checks in a row.
def getNetworkInComm(name, action, instance_id):
    """Build the ``cloudwatch put-metric-alarm`` command for EC2 NetworkIn.

    The command is only assembled and returned; a progress banner is printed.

    The ``InstanceId`` dimension uses the ``{id}`` placeholder. The original
    template contained ``%s`` there, which ``str.format`` leaves untouched, so
    the alarm was created for the literal instance ID ``%s``.

    :param name: instance name, used in the alarm name ``AWS_EC2_<name>_NetworkIn``.
    :param action: ARN used for both ``--alarm-actions`` and ``--ok-actions``.
    :param instance_id: value of the ``InstanceId`` dimension.
    :return: the complete command line as a single-line string.
    """
    mertic = 'NetworkIn'
    print("#####开始配置 %s#####" % mertic)
    return '''{cli} cloudwatch put-metric-alarm \
--alarm-name "AWS_EC2_{name}_{mertic}" \
--alarm-description "aws ec2 {mertic}" \
--metric-name {mertic} \
--namespace AWS/EC2 \
--statistic Average \
--period 60 \
--threshold 5000000 \
--evaluation-periods 3 \
--datapoints-to-alarm 3 \
--comparison-operator GreaterThanOrEqualToThreshold \
--treat-missing-data notBreaching \
--alarm-actions "{action}" \
--ok-actions "{action}" \
--dimensions "Name=InstanceId,Value={id}"'''.format(cli=Contants['AWSCLI'], name=name, action=action, id=instance_id, mertic=mertic)


# NetworkOut: alarm when the 1-minute average is >= 5,000,000 bytes for 3 checks in a row.
def getNetworkOutComm(name, action, instance_id):
    """Build the ``cloudwatch put-metric-alarm`` command for EC2 NetworkOut.

    The command is only assembled and returned; a progress banner is printed.

    :param name: instance name, used in the alarm name ``AWS_EC2_<name>_NetworkOut``.
    :param action: ARN used for both ``--alarm-actions`` and ``--ok-actions``.
    :param instance_id: value of the ``InstanceId`` dimension.
    :return: the complete command line as a single-line string.
    """
    mertic = 'NetworkOut'
    print("#####开始配置 %s#####" % mertic)
    # (flag, value template) pairs, emitted in this order.
    options = (
        ('--alarm-name', '"AWS_EC2_{name}_{mertic}"'),
        ('--alarm-description', '"aws ec2 {mertic}"'),
        ('--metric-name', '{mertic}'),
        ('--namespace', 'AWS/EC2'),
        ('--statistic', 'Average'),
        ('--period', '60'),
        ('--threshold', '5000000'),
        ('--evaluation-periods', '3'),
        ('--datapoints-to-alarm', '3'),
        ('--comparison-operator', 'GreaterThanOrEqualToThreshold'),
        ('--treat-missing-data', 'notBreaching'),
        ('--alarm-actions', '"{action}"'),
        ('--ok-actions', '"{action}"'),
        ('--dimensions', '"Name=InstanceId,Value={id}"'),
    )
    rendered_options = ' '.join(flag + ' ' + value for flag, value in options)
    template = '{cli} cloudwatch put-metric-alarm ' + rendered_options
    return template.format(
        cli=Contants['AWSCLI'],
        name=name,
        action=action,
        id=instance_id,
        mertic=mertic,
    )


# Run a shell command
def execCommand(comm):
    """Run ``comm`` through the shell and return its captured output.

    The command and its exit status are printed. If running the command
    raises, the exception is printed and ``None`` is returned.

    :param comm: the command line to execute.
    :return: the command's output as returned by
        ``subprocess.getstatusoutput``, or ``None`` on an exception.
    """
    try:
        print(comm)
        result = subprocess.getstatusoutput(comm)
        exit_code, output = result
        print(exit_code)
    except Exception as err:
        print(err)
        return None
    return output


# Collect basic information about the requested EC2 instances
def getAll(get_server_id_list):
    """Describe all EC2 instances and return those listed in ``get_server_id_list``.

    Runs ``ec2 describe-instances`` through ``execCommand`` using the CLI
    configured in ``Contants['AWSCLI']`` and parses the JSON output.

    Fixes over the previous version:
    * one record is built per instance (a reservation can hold several
      instances; a dict shared per reservation made them overwrite each other);
    * an instance without a ``Name`` tag, or without any tags, falls back to
      its instance ID as the name instead of raising ``KeyError``.

    :param get_server_id_list: container of instance IDs to keep.
    :return: list of dicts with keys ``id``, ``imageid``, ``instancetype``
        and ``name``, one per matching instance.
    """
    comm1 = "%s ec2 describe-instances" % Contants['AWSCLI']
    all_data = json.loads(execCommand(comm1))

    instance_list_modify = []
    for r in all_data['Reservations']:
        for i in r['Instances']:
            data = {
                'id': i['InstanceId'],
                'imageid': i['ImageId'],
                'instancetype': i['InstanceType'],
                # Default name; replaced below when a non-empty Name tag exists.
                'name': i['InstanceId'],
            }
            # "Tags" is absent from the description of untagged instances.
            for t in i.get('Tags', []):
                if t['Key'] == 'Name' and t['Value']:
                    data['name'] = t['Value']
                    break
            print(data)
            if data['id'] in get_server_id_list:
                instance_list_modify.append(data)
    print(instance_list_modify)
    return instance_list_modify


# Create the alarms
def add_alert(data, action):
    """Create CloudWatch alarms for every instance record in ``data``.

    For each record the instance details are printed and the CPUUtilization
    alarm command is built and executed. The other alarm types are kept as
    commented-out calls and can be enabled by uncommenting them.

    :param data: iterable of dicts with keys ``id``, ``name``, ``imageid``
        and ``instancetype``.
    :param action: ARN passed to the alarm command builders.
    """
    for server in data:
        instance_id, name = server['id'], server['name']
        imageid, instancetype = server['imageid'], server['instancetype']
        print(instance_id, name, imageid, instancetype)
        cpu_command = getCPUUtilizationComm(name, action, instance_id)
        execCommand(cpu_command)
        # Optional alarms (disabled):
        #execCommand(getNetworkInComm(name, action, instance_id))
        #execCommand(getNetworkOutComm(name, action, instance_id))
        #execCommand(getmem_used_percentComm(name, action, instance_id, instancetype, imageid))
        #execCommand(getdisk_used_percentComm(name, action, instance_id, instancetype, imageid))



def get_server_info(instance_list):
    """Describe each instance ID in ``instance_list`` and return its basic details.

    One ``ec2 describe-instances --instance-ids <id>`` call is made per ID.

    Fixes over the previous version:
    * the CLI configured in ``Contants['AWSCLI']`` is used (it already carries
      ``--output json`` and the ``--region`` appended by the entry point)
      instead of a bare ``aws``, which silently ignored the selected region;
    * an instance without a ``Name`` tag, or without any tags, gets its
      instance ID as ``name`` instead of leaving the key out (``add_alert``
      reads ``name`` unconditionally);
    * a fresh dict is built per instance and the pipe is closed after reading.

    :param instance_list: iterable of EC2 instance IDs.
    :return: list of dicts with keys ``id``, ``imageid``, ``instancetype``
        and ``name``.
    """
    server_info = []
    for i in instance_list:
        cmd = "{0} ec2 describe-instances --instance-ids {1}".format(Contants['AWSCLI'], i)
        # Show the command being executed.
        print(cmd)
        with os.popen(cmd) as pipe:
            json_str = json.loads(pipe.read())
        for Reservations_list in json_str["Reservations"]:
            for Instances_list in Reservations_list["Instances"]:
                # Keep only the instance that was asked for.
                if Instances_list["InstanceId"] != i:
                    continue
                server_dict = {
                    'id': Instances_list["InstanceId"],
                    'imageid': Instances_list["ImageId"],
                    'instancetype': Instances_list["InstanceType"],
                    # Default name; replaced below when a Name tag exists.
                    'name': Instances_list["InstanceId"],
                }
                # "Tags" is absent from the description of untagged instances.
                for tag_item in Instances_list.get("Tags", []):
                    if tag_item["Key"] == "Name" and tag_item["Value"]:
                        server_dict['name'] = tag_item["Value"]
                        break
                print(server_dict)
                server_info.append(server_dict)
    return server_info

if __name__ == '__main__':
    # 2. Configure the SNS topic ARN that receives the alarm notifications
    #    and the IDs of the instances to monitor.
    sns_arn = "arn:aws:sns:eu-central-1:643xxxxx:xxxx-CloudWatch-Lambda-DingTalk"
    ip_list = ["i-010bxxxx","i-00xxxxx"]
    # Remember the region-less CLI prefix so each iteration starts from it.
    base_cli = Contants['AWSCLI']
    for region in Contants['AWSREGION']:
        print('[Region] ', region)
        Contants['AWSCLI'] = '{0} --region {1}'.format(base_cli, region)
        servers = get_server_info(ip_list)
        add_alert(servers, sns_arn)


三、Amazon SNS创建主题

创建SNS主题,并将其关联(订阅)到Lambda钉钉通知函数

AWS CloudWatch监控之钉钉告警-鸿蒙开发者社区

四、Lambda钉钉函数通知脚本

AWS CloudWatch监控之钉钉告警-鸿蒙开发者社区

上传如下脚本,通过CloudWatch调试EC2设定的规则来触发告警测试

Python
# _*_coding:utf-8_*_
# python 3.8
# Creation time: 2021/11/18
import time
import hmac
import hashlib
import base64
import urllib.parse
import json
import os
import requests
import datetime


def _signed_webhook_url(token, secret):
    """Return the DingTalk robot webhook URL signed for the current time.

    The signature is the URL-quoted, base64-encoded HMAC-SHA256 of
    ``"<timestamp>\\n<secret>"`` keyed with ``secret``; the timestamp is the
    current time in milliseconds.
    """
    timestamp = str(round(time.time() * 1000))
    string_to_sign = '{}\n{}'.format(timestamp, secret)
    hmac_code = hmac.new(secret.encode('utf-8'), string_to_sign.encode('utf-8'),
                         digestmod=hashlib.sha256).digest()
    sign = urllib.parse.quote_plus(base64.b64encode(hmac_code))
    # The query separator must be "&timestamp="; the previous text had been
    # garbled into "×tamp=", which produced an invalid request.
    return "https://oapi.dingtalk.com/robot/send?access_token={}&timestamp={}&sign={}".format(
        token, timestamp, sign)


def _resource_id(dimensions):
    """Pick the dimension value that is shown as the instance ID.

    Prefers the dimension named ``InstanceId``. When none is found, falls back
    to the previous positional rule: the first value, or the second one when
    the first looks like a path (starts with ``/``).
    """
    for dim in dimensions:
        if dim.get('name') == 'InstanceId':
            return dim['value']
    if not dimensions:
        return ''
    value = dimensions[0]['value']
    if value.startswith('/') and len(dimensions) > 1:
        value = dimensions[1]['value']
    return value


def lambda_handler(event, context):
    """Forward a CloudWatch alarm delivered through SNS to a DingTalk robot.

    :param event: SNS event; ``event['Records'][0]['Sns']['Message']`` must be
        the JSON text of a CloudWatch alarm notification.
    :param context: Lambda context object (unused).
    :return: the body of the DingTalk HTTP response, decoded as UTF-8.
    """
    headers = {'Content-Type': 'application/json;charset=utf-8'}
    # Credentials may be supplied through environment variables; the previous
    # hard-coded values remain the defaults so existing deployments still work.
    token = os.environ.get('DINGTALK_TOKEN', 'ca5533c8cb976c21')
    secret = os.environ.get('DINGTALK_SECRET', 'SEC8d1a31ec5e8e91')
    api_url = _signed_webhook_url(token, secret)

    # msg setting
    message = event['Records'][0]['Sns']
    Subject = message.get('Subject') or ''  # a missing/None subject must not crash the "in" tests
    sns_message = json.loads(message['Message'])
    # Report the time in UTC+8 regardless of the host's local time zone.
    utc8 = datetime.timezone(datetime.timedelta(hours=8))
    current_time = datetime.datetime.now(utc8).strftime('%Y-%m-%d %H:%M:%S')

    # Status icon chosen from the notification subject.
    if "ALARM" in Subject:
        title = '![1.png](https://xxx.oss-cn.aliyuncs.com/dingding-image/1.png)'
    elif "OK" in Subject:
        title = '![2.png](https://xxx.oss-cn-shanghai.aliyuncs.com/dingding-image/2.png)'
    else:
        title = '![3.png](https://xxx.oss-cn-shanghai.aliyuncs.com/dingding-image/3.png)'

    trigger = sns_message['Trigger']
    _value = _resource_id(trigger['Dimensions'])
    # The alarm description can be null when the alarm has none.
    description = sns_message.get('AlarmDescription') or ''
    lines = [
        "### {title}".format(title=title),
        "> #### **时间**: " + current_time,
        "> #### **状态**: " + sns_message['OldStateValue'] + " => " + sns_message['NewStateValue'],
        "> #### **告警名称**: " + sns_message['AlarmName'],
        "> #### **账户ID**: " + sns_message['AWSAccountId'],
        "> #### **AWS区域**: " + sns_message['Region'],
        "> #### **描述**: " + description,
        "> #### **产品资源**: " + trigger['Namespace'],
        "> #### **实例ID**: " + _value,
        "> #### **指标名称**: " + trigger['MetricName'],
        "> #### **报警详情**: " + sns_message['NewStateReason'],
    ]
    content = "\n".join(lines)

    msg = {
        "msgtype": "markdown",
        "markdown": {
            "title": title,
            "text": content
        },
        "at": {
            # Boolean rather than the string "true".
            "isAtAll": True
        }
    }

    # request (bounded by a timeout so a stalled call cannot hang the function)
    response = requests.post(url=api_url, data=json.dumps(msg), headers=headers, timeout=10)
    return response.content.decode("utf8")


AWS子账户权限调试工具

https://policysim.aws.amazon.com/


文章转载自公众号:新钛云服

分类
已于2023-7-12 17:32:22修改
收藏
回复
举报
回复
    相关推荐