监控系统告警脚本集合

告警系统对接原则:以脚本执行的返回值(退出码)为准,返回值为0表示正常,返回值为1表示异常;出现异常时,根据预先配置的内容发出告警短信或邮件。

mysql拨测监控告警

#!/bin/bash
# MySQL liveness probe.
# Runs "select 1" against the local instance and maps the outcome onto the
# alerting contract: exit 0 = healthy, exit 1 = alarm.
mysql_client=/apps/svr/mysql_3306/bin/mysql

# -N drops the column header so only the value is captured. Client errors are
# discarded, which leaves the variable empty when the query cannot be run.
probe_output=$("$mysql_client" -uuserAndPassword -puserAndPassword -h127.0.0.1 -N -e "select 1" 2>/dev/null)

if [[ $probe_output -eq 1 ]]; then
 echo "select 1 is OK"
 exit 0
fi

echo "ERROR,select 1 is not OK"
exit 1

mysql连接数超过90%告警

#!/bin/bash
# MySQL connection-usage alarm.
# Compares the number of open client connections with max_connections and
# raises the alarm once usage reaches the threshold (90%).
# Exit status: 0 = below threshold, 1 = threshold reached or metrics unavailable.
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
pwd=/data01/svr/mysql_3306/bin
threshold=90

# Threads_connected is the number of currently open connections, which is the
# quantity limited by max_connections (Threads_running only counts threads
# that are not sleeping).
re_status=$("$pwd/mysql" -u"$user" -p"$password" -h"$host" --port="$port" -N -e "show global status like 'Threads_connected'" 2>/dev/null | awk '{print $2}')

re_variables=$("$pwd/mysql" -u"$user" -p"$password" -h"$host" --port="$port" -N -e "show global variables like 'max_connections'" 2>/dev/null | awk '{print $2}')

# Without both numbers the ratio is meaningless; report that instead of
# silently falling into the "normal" branch.
if [[ ! $re_status =~ ^[0-9]+$ || ! $re_variables =~ ^[0-9]+$ || $re_variables -eq 0 ]]; then
 echo "ERROR,cannot read Threads_connected/max_connections"
 exit 1
fi

result=$(awk -v used="$re_status" -v max="$re_variables" 'BEGIN{printf "%.2f", used / max * 100}')

# Compare numerically: a string comparison would rank "100.00" below "90".
if awk -v pct="$result" -v limit="$threshold" 'BEGIN{exit !(pct < limit)}'; then
 echo "连接数正常"
 exit 0
else
 echo "当前连接数超过90%,告警!"
 exit 1
fi

mysql主从状态监控异常告警

#!/bin/bash
# MySQL replication health alarm.
# Raises the alarm (exit 1) when either replication thread is not running or
# when Seconds_Behind_Master exceeds 60 seconds. A host for which
# "show slave status" returns nothing is treated as a master / standalone
# instance and reported healthy (exit 0).
# The monitoring account needs the "replication client" privilege:
#   grant replication client on *.* to repl@'%';
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
pwd=/data01/svr/mysql_3306/bin
max_delay=60


DATE=$(date +"%Y-%m-%d %H:%M:%S")    # timestamp used in the log lines

# Query once and reuse the text for every field.
# NOTE(review): client errors are discarded, so a failed query also yields an
# empty result and is reported as "Master" - confirm this is acceptable.
SLAVE_STATUS=$("$pwd/mysql" -u"$user" -p"$password" -h"$host" --port="$port" -S /apps/run/mysql_3306/mysql.sock -e "show slave status\G" 2>/dev/null)

# Print the value of one "Key: value" line of the \G-formatted output.
# Looking fields up by name avoids depending on how many other lines happen
# to match or on their order.
slave_field() {
  printf '%s\n' "$SLAVE_STATUS" | awk -F': ' -v key="$1" '
    { name = $1; gsub(/^[ \t]+/, "", name) }
    name == key { print $2; exit }'
}

if [ -n "$SLAVE_STATUS" ]; then    # this host is a replica
  IO_env=$(slave_field Slave_IO_Running)
  SQL_env=$(slave_field Slave_SQL_Running)
  Seconds_Behind_env=$(slave_field Seconds_Behind_Master)
  if [ "$IO_env" = "Yes" ] && [ "$SQL_env" = "Yes" ]; then    # replication is running
    echo "[$DATE] [INFO] Master-slave synchronization is running!"
  else    # replication is broken: alarm
    echo "[$DATE] [ERROR] Master-slave synchronization is not running!"
    exit 1
  fi
  # Check the replication delay; NULL, empty or non-numeric values are skipped
  # so the numeric test below never sees a malformed operand.
  case "$Seconds_Behind_env" in
    ''|*[!0-9]*)
      ;;
    *)
      if [ "$Seconds_Behind_env" -gt "$max_delay" ]; then    # delay above 60 seconds: alarm
        echo "[$DATE] [ERROR] Master-slave synchronization delay time is greater than 60 seconds!"
        exit 1
      fi
      ;;
  esac
else
  echo "Master"    # this host is a master or a standalone instance
fi
exit 0

mysql集群未提交长事务监控异常告警

#!/bin/bash
# Long-running uncommitted transaction alarm.
# Counts InnoDB transactions that have been open for more than 60 seconds.
# More than 50 of them raises the alarm (exit 1); otherwise the count is
# reported and the script exits 0.
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
pwd=/data01/svr/mysql_3306/bin

DATE=$(date +"%Y-%m-%d %H:%M:%S")

# One row per transaction older than 60 seconds, joined to its connection id.
long_trx_sql="select a.id as conn_id, time_to_sec(timediff(now(),b.trx_started)) as trx_open_seconds from information_schema.processlist a right outer join information_schema.innodb_trx b on a.id = b.trx_mysql_thread_id  where  time_to_sec(timediff(now(),b.trx_started))>60;"

# The header row is removed by filtering on its column name, so the line
# count equals the number of matching transactions.
MYSQLCOMMIT=$("$pwd/mysql" -u"$user" -p"$password" -h"$host" --port="$port" -e "$long_trx_sql" 2>/dev/null \
  | grep -v trx_open_seconds \
  | wc -l)

if [ "$MYSQLCOMMIT" -gt 50 ]; then
    echo "[$DATE] [WARNING] 事务超过60秒未提交数量超过50个!"
    exit 1
fi

echo "[$DATE] [INFO] 事务超过60秒未提交数量: $MYSQLCOMMIT"
exit 0

mysql缓存命中率

#!/bin/bash
# InnoDB buffer pool hit-ratio alarm.
# hit ratio = (1 - Innodb_buffer_pool_reads / Innodb_buffer_pool_read_requests) * 100
# Exit status: 0 = ratio is at least 99, 1 = ratio below 99 or metrics unavailable.
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306

DATE=$(date +"%Y-%m-%d %H:%M:%S")    # timestamp used in the log lines
Innodb_buffer_read_hit_ratio=0


# getBufferRatio: read the two buffer pool counters and store the hit ratio
# (two decimals) in Innodb_buffer_read_hit_ratio.
# Globals written: Innodb_buffer_pool_read_requests, Innodb_buffer_pool_reads,
#                  Innodb_buffer_read_hit_ratio
# Returns: 0 on success, 1 when either counter could not be read.
getBufferRatio(){
    local status
    status=$(/data01/svr/mysql_${port}/bin/mysql -u"${user}" -p"${password}" -h"${host}" --port="${port}" -N -e "show global status like 'Innodb_buffer_pool%';" 2>/dev/null)
    # Select each counter by its exact name instead of by position.
    Innodb_buffer_pool_read_requests=$(printf '%s\n' "$status" | awk '$1 == "Innodb_buffer_pool_read_requests" {print $2}')
    Innodb_buffer_pool_reads=$(printf '%s\n' "$status" | awk '$1 == "Innodb_buffer_pool_reads" {print $2}')
    if [[ ! $Innodb_buffer_pool_read_requests =~ ^[0-9]+$ || ! $Innodb_buffer_pool_reads =~ ^[0-9]+$ ]]; then
        return 1
    fi
    # No read requests yet means nothing has missed the buffer pool; this also
    # avoids a division by zero.
    if [[ $Innodb_buffer_pool_read_requests -eq 0 ]]; then
        Innodb_buffer_read_hit_ratio=100.00
        return 0
    fi
    Innodb_buffer_read_hit_ratio=$(awk -v reads="$Innodb_buffer_pool_reads" -v requests="$Innodb_buffer_pool_read_requests" 'BEGIN{printf "%.2f\n",(1-reads/requests)*100}')
}

if ! getBufferRatio; then
    echo "[$DATE] [ERROR] cannot read Innodb_buffer_pool counters"
    exit 1
fi
echo "$(date "+%Y-%m-%d_%H:%M:%S") $Innodb_buffer_read_hit_ratio"

# Compare on the integer part only (99.50 -> 99).
if [ "${Innodb_buffer_read_hit_ratio%.*}" -lt 99 ];then
    echo "[$DATE] [WARNING] buffer命中率低于99!"
    exit 1
else
    echo "[$DATE] [INFO] buffer命中率: $Innodb_buffer_read_hit_ratio"
    exit 0
fi

MySQL锁表监控告警

#!/bin/bash
# MySQL lock-wait alarm.
# Reads Innodb_row_lock_current_waits and raises the alarm when it is above 0.
# Exit status: 0 = no waits, 1 = waits present or metric unavailable.
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
pwd=/data01/svr/mysql_3306/bin

DATE=$(date +"%Y-%m-%d %H:%M:%S")    # timestamp used in the log lines

a=$("$pwd/mysql" -u"$user" -p"$password" -h"$host" --port="$port" -e "SHOW  STATUS LIKE 'Innodb_row_lock_current_waits'" 2>/dev/null | awk '$1 == "Innodb_row_lock_current_waits" {print $2}')

# An empty or non-numeric value means the query failed; without this guard the
# numeric test would error out and fall into the "normal" branch.
if [[ ! $a =~ ^[0-9]+$ ]]; then
    echo "[$DATE] [ERROR] cannot read Innodb_row_lock_current_waits"
    exit 1
fi

if [ "$a" -gt 0 ];then
    echo "[$DATE] [WARNING] 出现锁表!!"
    exit 1
else
    echo "[$DATE] [INFO] 锁表检查正常。"
    exit 0
fi

QPS大于10000告警

#!/bin/bash
# MySQL QPS alarm.
# Computes Questions / Uptime from one "mysqladmin status" sample, i.e. the
# average number of statements per second since the server started.
# Exit status: 0 = at most 10000, 1 = above 10000 or metrics unavailable.
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
pwd=/data01/svr/mysql_3306/bin
Ratio=0

DATE=$(date +"%Y-%m-%d %H:%M:%S")    # timestamp used in the log lines

# Take a single sample so that both counters refer to the same instant.
status_line=$("$pwd/mysqladmin" -u"$user" -p"$password" -h"$host" --port="$port" status 2>/dev/null)

# Print the token that follows the given "Label:" token in the status line.
status_value() {
 awk -v label="$1" '{for (i = 1; i < NF; i++) if ($i == label) {print $(i + 1); exit}}' <<<"$status_line"
}

Uptime=$(status_value "Uptime:")
Questions=$(status_value "Questions:")

# Missing counters (or a zero uptime) would break the division and make the
# check report a healthy state; fail loudly instead.
if [[ ! $Uptime =~ ^[0-9]+$ || ! $Questions =~ ^[0-9]+$ || $Uptime -eq 0 ]]; then
    echo "[$DATE] [ERROR] cannot read Uptime/Questions from mysqladmin status"
    exit 1
fi

Ratio=$(awk -v q="$Questions" -v up="$Uptime" 'BEGIN{ printf "%.2f\n", q / up }')

# Compare on the integer part only.
if [ "${Ratio%.*}" -gt 10000 ];then
    echo "[$DATE] [WARNING] QPS大于10000!"
    exit 1
else
    echo "[$DATE] [INFO] 当前QPS为: $Ratio"
    exit 0
fi

TPS大于4000告警

#!/bin/bash
# MySQL TPS alarm.
# Computes (Com_commit + Com_rollback) / Uptime, i.e. the average number of
# transactions per second since the server started.
# Exit status: 0 = at most 4000, 1 = above 4000 or metrics unavailable.
user=userAndPassword
password=userAndPassword
host=127.0.0.1
port=3306
pwd=/data01/svr/mysql_3306/bin

DATE=$(date +"%Y-%m-%d %H:%M:%S")    # timestamp used in the log lines

Uptime=$("$pwd/mysqladmin" -u"$user" -p"$password" -h"$host" --port="$port" status 2>/dev/null \
  | awk '{for (i = 1; i < NF; i++) if ($i == "Uptime:") {print $(i + 1); exit}}')

# One extended-status sample supplies both counters. Rows look like
# "| Com_commit | 123 |", so the name is field 2 and the value field 4; an
# exact name match keeps Com_rollback_to_savepoint out without relying on
# gawk-only word-boundary operators.
extended=$("$pwd/mysqladmin" -u"$user" -p"$password" -h"$host" --port="$port" extended-status 2>/dev/null)
rollback=$(printf '%s\n' "$extended" | awk '$2 == "Com_rollback" {print $4}')
commit=$(printf '%s\n' "$extended" | awk '$2 == "Com_commit" {print $4}')

# Fail loudly when any counter is missing instead of reporting a healthy state.
if [[ ! $Uptime =~ ^[0-9]+$ || ! $rollback =~ ^[0-9]+$ || ! $commit =~ ^[0-9]+$ || $Uptime -eq 0 ]]; then
    echo "[$DATE] [ERROR] cannot read Uptime/Com_commit/Com_rollback"
    exit 1
fi

TpsRatio=$(awk -v trx="$((rollback + commit))" -v up="$Uptime" 'BEGIN{printf "%.2f\n", trx / up}')

# Compare on the integer part only.
if [ "${TpsRatio%.*}" -gt 4000 ];then
    echo "[$DATE] [WARNING] TPS大于4000!"
    exit 1
else
    echo "[$DATE] [INFO] 当前TPS为: $TpsRatio"
    exit 0
fi

haproxy后端状态监测

#!/bin/bash
# HAProxy backend status check.
# Downloads the status page, keeps the rows of the monitored backends and
# counts how many of them contain DOWN. Exactly 2 such rows is treated as the
# healthy state (exit 0); any other count raises the alarm (exit 1).
# NOTE(review): the URL has no scheme and its credential part looks mangled
# ("[email protected]") - confirm the real address before relying on this check.
# NOTE(review): "exactly 2 DOWN rows is healthy" looks environment specific -
# confirm the expected baseline.
stats_url='//admin:[email protected]:2000/status'
backend_pattern='order01|base|cust|idservice|irsc|sec|upc|ewe|ftpgw '

down_rows=$(curl -s  "$stats_url" | grep -E "$backend_pattern" | grep -c DOWN)

if [[ $down_rows != 2 ]]; then
   exit 1
fi
exit 0

JAVA内存溢出告警-OutOfMemory

#!/bin/bash
# Java OutOfMemoryError alarm.
# Scans the last 1000 lines of today's service log (file name carries the
# month and day) for java.lang.OutOfMemoryError.
# Exit status: 0 = no occurrence, 1 = at least one occurrence.

# Build the file name with $(...): backquotes cannot be nested, so embedding
# `date` inside another backquoted command breaks the command line apart.
log_file="/apps/logs/svc/svc-node01-order01-$(date +%m%d).log"

# -F treats the dots in the class name literally; -c prints the match count.
result=$(tail -n 1000 "$log_file" | grep -cF 'java.lang.OutOfMemoryError')

if [[ $result -eq 0 ]]; then
   exit 0
else
   exit 1
fi

redis集群状态异常告警

#!/bin/bash
# Redis cluster state check.
# Counts the "cluster_state" lines of "cluster info" that contain "ok".
# Exactly one such line means healthy (exit 0); anything else alarms (exit 1).
redis_cli=/apps/svr/ids/redis-cli

ok_lines=$("$redis_cli"  -c -p 11001  -h 127.0.0.1 cluster info | grep cluster_state | grep -c ok)

case "$ok_lines" in
  1) exit 0 ;;
  *) exit 1 ;;
esac

redis节点状态监测告警

#!/bin/bash
# Redis cluster node-count check.
# Reads cluster_known_nodes from "cluster info" and compares it with the
# expected number of nodes.
# Exit status: 0 = count matches, 1 = count differs or could not be read.
expected_nodes=6

# The reply uses CRLF line endings; strip the carriage return, otherwise the
# value is "6\r" and can never equal "6".
result=$(/apps/svr/ids/redis-cli -c -p 11001 -h 127.0.0.1 cluster info \
  | tr -d '\r' \
  | awk -F ':' '$1 == "cluster_known_nodes" {print $2}')

if [[ "$result" == "$expected_nodes" ]]; then
   exit 0
else
   exit 1
fi

redis_slot异常告警

#!/bin/bash
# Redis cluster slot check.
# Reads cluster_slots_ok from "cluster info" and compares it with the expected
# slot count (16384).
# Exit status: 0 = count matches, 1 = count differs or could not be read.
expected_slots=16384

# The reply uses CRLF line endings; strip the carriage return, otherwise the
# value carries a trailing "\r" and the comparison always fails.
result=$(/apps/svr/ids/redis-cli -c -p 11001 -h 127.0.0.1 cluster info \
  | tr -d '\r' \
  | awk -F ':' '$1 == "cluster_slots_ok" {print $2}')

if [[ "$result" == "$expected_slots" ]]; then
   exit 0
else
   exit 1
fi

keepalivedVIP连通性监控告警

#!/bin/sh
# Keepalived VIP reachability check.
# Extracts the IPv4 addresses from keepalived.conf (skipping lines that
# mention "real", "smtp_server", 127.0.0.1 or "#"), pings each one and counts
# the addresses that do not answer with 0% packet loss.
# Exit status: 0 = every address answered without loss, 1 = otherwise.
VIPS=$(grep -v real /etc/keepalived/keepalived.conf | grep -v smtp_server | grep -v 127.0.0.1 | grep -v '#' | grep -E -o "\b([0-9]{1,3}\.){3}[0-9]{1,3}\b" | sort -nu)
num=0
for ips in $VIPS
do
        # Take the "N%" token of the summary line rather than a fixed column:
        # the column moves when ping inserts an "+N errors" field.
        loss=$(ping -w 2 -c 3 "${ips}" | awk '/packet loss/ {for (i = 1; i <= NF; i++) if ($i ~ /%$/) {sub(/%/, "", $i); print $i; exit}}')
        # An empty value (no summary line at all) counts as a failure too.
        if [ "$loss" != "0" ]; then
              # POSIX arithmetic: "let" is a bashism and is unavailable when
              # /bin/sh is not bash, which left the counter stuck at 0.
              num=$((num + 1))
        fi
done
if [ "$num" -eq 0 ]; then
  exit 0
else
  exit 1
fi

keepalivedVIP丢失告警

#!/bin/bash
# Keepalived VIP-loss check.
# Compares the current number of non-loopback IPv4 address lines printed by
# "ip a" with the number saved in /tmp/check_vip.log by an earlier run.
#   - no saved file yet: save the current count and finish
#   - counts equal:      print "vip ok" and exit 0
#   - counts differ:     save the new count and exit 1
baseline_file=/tmp/check_vip.log

# Print the number of IPv4 address lines, loopback excluded.
count_ipv4() {
  ip a |grep inet|grep -v 127.0.0.1|grep -v inet6 |wc -l
}

ip_count=$(count_ipv4)

if [ ! -f "$baseline_file" ]; then
  # First run: only record the baseline.
  count_ipv4 > "$baseline_file"
else
  vip_count=$(sudo cat "$baseline_file")
  if [ "$ip_count" != "$vip_count" ]; then
    # Store the new count so the change is reported once.
    echo $ip_count > "$baseline_file"
    exit 1
  fi
  echo "vip ok"
  exit 0
fi

keepalived脑裂预警

这里解释一下:脑裂的验证需要结合多个节点的情况,而监控脚本应尽量避免做得过于复杂,因此这里只在keepalived备节点上做监控,一旦发现VIP切换(备节点上出现VIP)即发出告警,再由人工介入检查是否发生脑裂。

#!/bin/bash
# Keepalived split-brain early warning.
# Counts the IPv4 addresses with a /32 prefix reported by "ip addr".
# Exit status: 0 = none present, 1 = at least one present.

# Match the "/32" prefix of an inet line explicitly; a bare "32" also matches
# unrelated text such as an address octet or part of a MAC address.
result=$(ip addr | grep -cE 'inet [0-9.]+/32( |$)')

if [[ $result -eq 0 ]]; then
   exit 0
else
   echo "keepalived从节点出现32位的vip,可能出现脑裂现象"
   exit 1
fi

rocketmq集群节点数量监控告警

#!/bin/bash
# RocketMQ cluster node-count alarm.
# Counts the lines of "mqadmin clusterList" that do not contain "Version"
# (which removes the header row) and alarms when fewer than 4 remain.
# Exit status: 0 = at least 4 lines, 1 = fewer than 4.
DATE=$(date +"%Y-%m-%d %H:%M:%S")
declare -x JAVA_HOME="/apps/tools/jdk"
min_nodes=4

# sudo -E keeps JAVA_HOME in the environment of mqadmin.
NumCluster=$(sudo -E /apps/svr/mqbroker/rocketmq/bin/mqadmin  clusterList -n 127.0.0.1:9876 | grep -v Version | wc -l)

# The warning is about too FEW nodes, so the test must be "less than"; the
# previous "-gt 4" could never fire when nodes disappeared.
if [ "$NumCluster" -lt "$min_nodes" ];then
    echo "[$DATE] [WARNING] rocketMQ集群节点小于4个!"
    exit 1
else
    echo "[$DATE] [INFO] rocketMQ集群节点数量为: $NumCluster"
    exit 0
fi

rocketmq消息数量异常告警

#!/bin/bash
# RocketMQ topic-count alarm.
# Counts the topics listed by "mqadmin topicList" whose name contains both
# "Blue" and "CIDC" but not "RETRY". Fewer than 50 raises the alarm (exit 1);
# otherwise the count is reported and the script exits 0.
DATE=$(date +"%Y-%m-%d %H:%M:%S")
export JAVA_HOME=/apps/tools/jdk
export JAVA_BIN=/apps/tools/jdk/bin

mqadmin=/apps/svr/mqbroker/rocketmq/bin/mqadmin

# sudo -E keeps the exported Java variables in the environment of mqadmin.
NumTopic=$(sudo -E "$mqadmin" topicList -n 127.0.0.1:9876 2>/dev/null \
  | grep Blue \
  | grep CIDC \
  | grep -v RETRY \
  | wc -l)

if [ "$NumTopic" -lt 50 ]; then
    echo "[$DATE] [WARNING] rocketMQ消息主题小于50个!"
    exit 1
fi

echo "[$DATE] [INFO] rocketMQ消息topic数量当前为: $NumTopic"
exit 0

logstash日志报错告警

#!/bin/bash
# Logstash log error alarm.
# Looks for the text "error" in the last 2000 lines of the logstash log.
# Exit status: 0 = no matching line, 1 = at least one matching line.
log_file=/apps/logs/logstash/logstash-plain.log

error_lines=$(tail -2000 "$log_file" | grep -c 'error')

case "$error_lines" in
  0) exit 0 ;;
  *) exit 1 ;;
esac

elasticsearch集群个数异常告警

#!/bin/bash
# Elasticsearch cluster node-count alarm.
# Reads number_of_nodes from the cluster health API and compares it with the
# expected cluster size.
# Exit status: 0 = count matches, 1 = count differs or could not be read.
expected_nodes=4

# The URL needs an explicit scheme (curl rejects "//host/...") and is quoted
# so the shell does not treat "?" as a glob. -s silences the progress meter.
# Only the digits of the value are kept: the raw field is " 4," and the
# leading space alone made the comparison fail.
result=$(curl -s "http://10.172.95.1:9201/_cluster/health?pretty" \
  | awk -F ':' '/"number_of_nodes"/ {gsub(/[^0-9]/, "", $2); print $2; exit}')

if [[ "$result" == "$expected_nodes" ]]; then
   exit 0
else
   exit 1
fi

elasticsearch数据节点个数异常告警

#!/bin/bash
# Elasticsearch data-node count alarm.
# Reads number_of_data_nodes from the cluster health API and compares it with
# the expected number of data nodes.
# Exit status: 0 = count matches, 1 = count differs or could not be read.
expected_data_nodes=4

# The URL needs an explicit scheme (curl rejects "//host/...") and is quoted
# so the shell does not treat "?" as a glob. -s silences the progress meter.
# Only the digits of the value are kept: the raw field is " 4," and the
# leading space alone made the comparison fail.
result=$(curl -s "http://10.172.95.1:9201/_cluster/health?pretty" \
  | awk -F ':' '/"number_of_data_nodes"/ {gsub(/[^0-9]/, "", $2); print $2; exit}')

if [[ "$result" == "$expected_data_nodes" ]]; then
   exit 0
else
   exit 1
fi

elasticsearch_java内存使用已超过48G

#!/bin/bash
# Elasticsearch JVM heap usage alarm.
# Reads heap_used_in_bytes from the cluster stats API and alarms when it is
# above 48 GiB.
# Exit status: 0 = at or below the limit, 1 = above the limit or unreadable.
# 51539607552 = 48G
heap_limit_bytes=51539607552

# The URL needs an explicit scheme (curl rejects "//host/...") and is quoted
# so the shell does not treat "?" as a glob. Only the digits of the value are
# kept.
result=$(curl -s "http://10.172.95.1:9201/_cluster/stats?pretty" \
  | awk -F ':' '/"heap_used_in_bytes"/ {gsub(/[^0-9]/, "", $2); print $2; exit}')

# No usable number means the check itself failed.
if [[ ! $result =~ ^[0-9]+$ ]]; then
   exit 1
fi

# Arithmetic comparison: ">" inside [[ ]] compares strings. Exceeding the
# limit is the abnormal case, so it must map to exit 1.
if (( result > heap_limit_bytes )); then
   exit 1
else
   exit 0
fi

zookeeper日志告警

#!/bin/bash
# ZooKeeper log alarm.
# Looks for the text "error" in the last 2000 lines of the log file below.
# Exit status: 0 = no matching line, 1 = at least one matching line.
# NOTE(review): the path points at the logstash log, not a ZooKeeper log -
# presumably a copy/paste leftover; confirm the intended file.
watched_log=/apps/logs/logstash/logstash-plain.log

matches=$(tail -2000 "$watched_log" | grep -c 'error')

if [[ $matches != 0 ]]; then
   exit 1
fi
exit 0

zookeeper集群follower-mode变更告警

#!/bin/bash
# ZooKeeper follower-mode change alarm.
# Counts the lines of "zkServer.sh status" that contain "follower".
# Exit status: 0 = no such line, 1 = the word is present.
# NOTE(review): presumably meant for a node expected to be the leader, so
# turning into a follower is the event being reported - confirm.
zk_status_cmd=/apps/sh/zk/zkServer.sh

follower_lines=$("$zk_status_cmd" status  | grep -c follower)

case "$follower_lines" in
  0) exit 0 ;;
  *) exit 1 ;;
esac

zookeeper集群leader-mode变更告警

#!/bin/bash
# ZooKeeper leader-mode change alarm.
# Counts the lines of "zkServer.sh status" that contain "leader".
# Exit status: 0 = no such line, 1 = the word is present.
# NOTE(review): presumably meant for a node expected to be a follower, so
# turning into the leader is the event being reported - confirm.
zk_status_cmd=/apps/sh/zk/zkServer.sh

leader_lines=$("$zk_status_cmd" status  | grep -c leader)

if [[ $leader_lines != 0 ]]; then
   exit 1
fi
exit 0

api接口层面监控告警

#!/bin/bash
# API endpoint check.
# Calls the queryOfferStatus method and counts the response lines that
# contain "OK".
# Exit status: 0 = exactly one such line, 1 = anything else.

# The URL needs an explicit scheme: curl rejects an address that starts with
# "//" because it finds no host name.
api_url='http://10.172.95.186:8000/emop?appId=600006&method=SYAN_UNHQ_queryOfferStatus&channelTypeId=0&flowdId=202006091314501278181&format=json&status=1%0A'

# -s silences the progress meter; -w "\n" terminates the body with a newline
# so the last line is counted even when the server sends none.
result=$(curl -s --location --request GET "$api_url" \
  --header 'Content-Type: text/plain' \
  --data '{  "productType": "vm"}' \
  -w "\n" | grep -c OK)

if [[ $result -eq 1 ]]; then
   exit 0
else
   exit 1
fi
Tags: