机房间网络监控部署说明(最新版本请见机房网络监控报警部分--<<网络测试与机房选择>>)
机房间网络监控部署说明(最新版本请见机房网络监控报警部分--<<网络测试与机房选择>>)
更新 陈信
20150708更新(报警中加入目的地的拼音说明)
20150706更新(报警短信中加入IP对应的机房名称)
20140822更新(加入mtr路由日志记录)
20131223更新(加入up的时间记录日志)
20131211更新(加入down和up的日志记录到ping_error_address.txt文件)
记录格式为:
------------ down ------------
......
------------ up ------------
自用机房使用:
测试机房使用:
20131209 更新 (up和down方式后,报警短信中的歧义,加入 one_line, some_line, all_line)
20130910 更新 (up和down方式)
cat network_interrupt.sh #本次更新未更新至各测试主机,加入了网络down时发送1次报警(down),up时再发送1次(通知up状态),而不再是每隔20分钟发送报警短信.
#!/bin/bash
# Network connect or disconnect test current
# Chenxin 20130118
# update:20130205 20130625 tracert
# update:20130910 add down and up status
# update:20131202
#
# Purpose: endlessly ping a fixed list of reference addresses, log degraded
# targets (with a traceroute) to /tmp/network_interrupt/ping_error_address.txt
# and send one SMS when the overall state goes down and one when it comes
# back up. The current state ("up" or "down") is kept in
# /tmp/network_interrupt/network_stable_status.txt.
#
# Requires: ./send_sms.sh in the working directory (/root/admin/).

# Load the distribution init helper library when one is present.
if [ -f /etc/init.d/functions ]; then
  . /etc/init.d/functions
elif [ -f /lib/lsb/init-functions ]; then
  . /lib/lsb/init-functions
fi
export PATH=$PATH:/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin

# install traceroute software
if ! command -v traceroute >/dev/null 2>&1; then
  if command -v yum >/dev/null 2>&1; then
    yum install -y traceroute
  fi
  if command -v apt-get >/dev/null 2>&1; then
    # -y: the script runs unattended (cron), so it must never wait for a prompt.
    apt-get install -y traceroute
  fi
fi

# send_sms.sh is called with a relative path, so the working directory matters.
cd /root/admin/ \
  || printf '%s\n' "warning: cannot cd to /root/admin/, ./send_sms.sh will not be found" >&2

# get local ip address
# First non-private "inet addr:" field printed by ifconfig (value keeps its
# "addr:" prefix). Only the first match is used so the value stays one word.
ip_address=$(ifconfig | awk '$0~/inet addr:/&&$0!~/addr:192.168|addr:127.0.0|addr:10\./{print $2; exit}')

mkdir -p /tmp/network_interrupt/
# Every start of the script assumes the network is currently up.
echo up > /tmp/network_interrupt/network_stable_status.txt

while true; do
  # awk needs one input record to run its main rule, hence the "echo |".
  echo |
    awk -v ip_address="$ip_address" '
    BEGIN {
      log_file = "/tmp/network_interrupt/ping_error_address.txt"
      status_file = "/tmp/network_interrupt/network_stable_status.txt"

      #Define telecom and unicom ip address
      telecom[0]="202.102.8.141"
      telecom[1]="202.99.96.68"
      #Define idc ip address
      telecom[2]="114.112.69.229"
      telecom[3]="121.101.210.1"
      #Define telecom ip
      #telecom[4]="202.96.209.133"
      telecom[4]="121.14.60.118"
      #Define idc ip address
      telecom[5]="122.112.3.1"
      telecom[6]="210.73.210.65"
      telecom[7]="114.112.58.65"
    }
    {
      # result[i] = number of ping replies (0..30) received from telecom[i].
      for (i in telecom) {
        # "cmd" | getline var runs cmd through the shell and reads the first
        # output line into var; no system() call is needed for that.
        cmd = "ping " telecom[i] " -c 30| grep ttl |wc -l"
        cmd | getline result[i]
        close(cmd)

        # 20 or fewer replies out of 30: log the target and trace the route.
        if (result[i] <= 20) {
          system("echo --- $(date +%Y%m%d-%H%M%S) " ip_address " to " telecom[i] " disconnect get " result[i] "/30 >> " log_file)
          system("echo tracert start $(date +%Y%m%d-%H%M%S) >> " log_file)
          system("traceroute -m 15 " telecom[i] " >> " log_file)
          system("echo tracert end $(date +%Y%m%d-%H%M%S) >> " log_file)
        }
      }
    }
    END {
      # Per-target alarm: 10 or fewer replies out of 30. The SMS is only sent
      # while the recorded state is still "up".
      for (i in result) {
        if (result[i] <= 10) {
          system("grep up " status_file " && ./send_sms.sh " ip_address " to " telecom[i] " get" result[i] "/30_down ")
          system("sleep 10")
        }
      }

      # Total replies over all targets: 8 targets * 30 packets = 240.
      for (i in result) {
        end_result += result[i]
      }

      # Hysteresis: <=210 flips up->down, >=235 flips down->up, anything in
      # between keeps the recorded state unchanged.
      if (end_result <= 210) {
        system("grep up " status_file " && ./send_sms.sh " ip_address " get " end_result "/240 packets_down && echo down > " status_file)
        system("echo ------ get " end_result "/240 data packets >> " log_file)
        system("sleep 30")
      } else if (end_result >= 235) {
        system("grep down " status_file " && ./send_sms.sh " ip_address " get " end_result "/240 packets_up && echo up > " status_file)
      } else {
        exit  # leave awk right away; the trailing sleep is skipped
      }
      system("sleep 10")
    }
    '
  sleep 10
done
20130625 更新(down后隔20分钟重新检测)
cat network_interrupt.sh
#!/bin/bash
# Network connect or disconnect test current
# Chenxin 20130118
# update:20130205 20130625 tracert
#
# Purpose: endlessly ping a fixed list of reference addresses, log degraded
# targets (with a traceroute) to /tmp/network_interrupt/ping_error_address.txt
# and send SMS alarms. After an overall alarm the check pauses for 20 minutes
# before running again.
#
# Requires: ./send_sms.sh in the working directory (/root/admin/).

# Load the distribution init helper library when one is present.
if [ -f /etc/init.d/functions ]; then
  . /etc/init.d/functions
elif [ -f /lib/lsb/init-functions ]; then
  . /lib/lsb/init-functions
fi
export PATH=$PATH:/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin

# install traceroute software
if ! command -v traceroute >/dev/null 2>&1; then
  yum install -y traceroute
fi

# send_sms.sh is called with a relative path, so the working directory matters.
cd /root/admin/ \
  || printf '%s\n' "warning: cannot cd to /root/admin/, ./send_sms.sh will not be found" >&2

# Reference list of public carrier addresses, kept as a no-op here-document.
: <<'ZHUSHI'
telecom
202.102.8.141
121.14.60.118
unicom
202.99.96.68
202.102.134.68
ZHUSHI

# Judge the process working;
# Count process-list lines mentioning this script (the count includes the
# grep itself) and refuse to start when too many copies are already running.
network_interrupt=$(ps axf | grep -c "network_interrupt.sh")
if (( network_interrupt >= 5 )); then
  echo "Too many same process is working... and then exit!!!!"
  exit
fi

# get local ip address
# First non-private "inet addr:" field printed by ifconfig (value keeps its
# "addr:" prefix). Only the first match is used so the value stays one word.
ip_address=$(ifconfig | awk '$0~/inet addr:/&&$0!~/addr:192.168|addr:127.0.0|addr:10\./{print $2; exit}')

mkdir -p /tmp/network_interrupt/

# tmp variables
# NOTE(review): both files are truncated here and handed to awk, but the awk
# program below never reads or writes them.
result_tmp="/tmp/result_tmp.txt"
all_tmp="/tmp/all_tmp.txt"
echo >"$result_tmp"
echo >"$all_tmp"

while true; do
  # awk needs one input record to run its main rule, hence the "echo |".
  echo |
    awk -v ip_address="$ip_address" -v result_tmp="$result_tmp" -v all_tmp="$all_tmp" '
    BEGIN {
      log_file = "/tmp/network_interrupt/ping_error_address.txt"

      #Define telecom and unicom ip address
      telecom[0]="202.102.8.141"
      telecom[1]="202.99.96.68"
      #Define idc ip address
      telecom[2]="114.112.69.229"
      telecom[3]="121.101.210.1"
      #Define telecom ip
      #telecom[4]="202.96.209.133"
      telecom[4]="121.14.60.118"
      #Define idc ip address
      telecom[5]="122.112.3.1"
      telecom[6]="180.186.88.65"
      telecom[7]="114.112.58.65"
    }
    {
      # result[i] = number of ping replies (0..30) received from telecom[i].
      for (i in telecom) {
        # "cmd" | getline var runs cmd through the shell and reads the first
        # output line into var; no system() call is needed for that.
        cmd = "ping " telecom[i] " -c 30| grep ttl |wc -l"
        cmd | getline result[i]
        close(cmd)

        # 20 or fewer replies out of 30: log the target and trace the route.
        if (result[i] <= 20) {
          system("echo $(date +%Y%m%d-%H%M%S) " ip_address " to " telecom[i] " disconnect get " result[i] "/30 >> " log_file)
          system("echo tracert start $(date +%Y%m%d-%H%M%S) >> " log_file)
          system("traceroute -m 15 " telecom[i] " >> " log_file)
          system("echo tracert end $(date +%Y%m%d-%H%M%S) >> " log_file)
        }
      }
    }
    END {
      # Per-target alarm: 10 or fewer replies out of 30.
      for (i in result) {
        if (result[i] <= 10) {
          system("./send_sms.sh " ip_address " to " telecom[i] " get" result[i] "/30 ")
          system("sleep 10")
        }
      }

      # Total replies over all targets: 8 targets * 30 packets = 240.
      for (i in result) {
        end_result += result[i]
      }

      # 210 or fewer replies out of 240: overall alarm, then back off for
      # 20 minutes so the alarm is not repeated on every cycle.
      if (end_result <= 210) {
        system("./send_sms.sh " ip_address " get " end_result "/240 packets ")
        system("echo ---------- get " end_result "/240 data packets >> " log_file)
        system("sleep 1200")
        exit  # leave awk; the trailing sleep is skipped
      }
      system("sleep 10")
    }
    '
  sleep 10
done
参考: 用在机房选择中的闪断报警程序(最新的请见<<网络测试与机房选择部分>>)
#!/bin/bash
# Network connect or disconnect test current
# Chenxin 20130118
# update:20130205 20130625 tracert
# update:20130910 add down and up status
# update:20131101 modify network_stable_status and grep
# update:20131104 for IDC interupt test only. iptables -A OUTPUT -p icmp -j DROP;sleep 15;iptables -D OUTPUT 1;
#
# Purpose: variant used while evaluating candidate data centers. It pings four
# reference addresses, logs degraded targets (with a traceroute) to
# /tmp/network_interrupt/ping_error_address.txt and sends one SMS when the
# overall state goes down and one when it comes back up. The current state
# ("up" or "down") is kept in /tmp/network_interrupt/network_stable_status.txt.
#
# Requires: ./send_sms.sh in the working directory (/home/admin/).

# Load the distribution init helper library when one is present.
if [ -f /etc/init.d/functions ]; then
  . /etc/init.d/functions
elif [ -f /lib/lsb/init-functions ]; then
  . /lib/lsb/init-functions
fi
export PATH=$PATH:/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin

# install traceroute software
if ! command -v traceroute >/dev/null 2>&1; then
  if command -v yum >/dev/null 2>&1; then
    yum install -y traceroute
  fi
  if command -v apt-get >/dev/null 2>&1; then
    # -y: the script runs unattended (cron), so it must never wait for a prompt.
    apt-get install -y traceroute
  fi
fi

# send_sms.sh is called with a relative path, so the working directory matters.
cd /home/admin/ \
  || printf '%s\n' "warning: cannot cd to /home/admin/, ./send_sms.sh will not be found" >&2

# get local ip address
# First non-private "inet addr:" field printed by ifconfig (value keeps its
# "addr:" prefix). Only the first match is used so the value stays one word.
ip_address=$(ifconfig | awk '$0~/inet addr:/&&$0!~/addr:192.168|addr:127.0.0|addr:10\./{print $2; exit}')

mkdir -p /tmp/network_interrupt/
# Every start of the script assumes the network is currently up.
echo up > /tmp/network_interrupt/network_stable_status.txt

while true; do
  # awk needs one input record to run its main rule, hence the "echo |".
  echo |
    awk -v ip_address="$ip_address" '
    BEGIN {
      log_file = "/tmp/network_interrupt/ping_error_address.txt"
      status_file = "/tmp/network_interrupt/network_stable_status.txt"

      #Define telecom and unicom ip address
      telecom[0]="202.102.8.141"
      telecom[1]="202.99.96.68"
      telecom[2]="121.14.60.118"
      telecom[3]="121.101.210.18"
    }
    {
      # result[i] = number of ping replies (0..30) received from telecom[i].
      for (i in telecom) {
        # "cmd" | getline var runs cmd through the shell and reads the first
        # output line into var; no system() call is needed for that.
        cmd = "ping " telecom[i] " -c 30| grep ttl |wc -l"
        cmd | getline result[i]
        close(cmd)

        # 26 or fewer replies out of 30: log the target and trace the route.
        if (result[i] <= 26) {
          system("echo --- $(date +%Y%m%d-%H%M%S) " ip_address " to " telecom[i] " disconnect get " result[i] "/30 >> " log_file)
          system("echo tracert start $(date +%Y%m%d-%H%M%S) >> " log_file)
          system("traceroute -m 15 " telecom[i] " >> " log_file)
          system("echo tracert end $(date +%Y%m%d-%H%M%S) >> " log_file)
        }
      }
    }
    END {
      # Per-target alarm: 10 or fewer replies out of 30. The SMS is only sent
      # while the recorded state is still "up".
      for (i in result) {
        if (result[i] <= 10) {
          system("grep up " status_file " && ./send_sms.sh " ip_address " to " telecom[i] " get" result[i] "/30_down ")
          system("sleep 10")
        }
      }

      # Total replies over all targets: 4 targets * 30 packets = 120.
      for (i in result) {
        end_result += result[i]
      }

      # Hysteresis: <=110 flips up->down, >=116 flips down->up, anything in
      # between keeps the recorded state unchanged.
      if (end_result <= 110) {
        system("grep up " status_file " && ./send_sms.sh " ip_address " get " end_result "/120 packets_down && echo down > " status_file)
        # The denominator is the packet total (120), matching the SMS text.
        system("echo ------ get " end_result "/120 data packets >> " log_file)
        system("sleep 30")
      } else if (end_result >= 116) {
        system("grep down " status_file " && ./send_sms.sh " ip_address " get " end_result "/120 packets_up && echo up > " status_file)
      } else {
        exit  # leave awk right away; the trailing sleep is skipped
      }
      system("sleep 10")
    }
    '
  sleep 10
done
机房间网络监控部署说明(最新版本请见机房网络监控报警部分--<<网络测试与机房选择>>)
20130205 陈信
备注:以下是比较老的版本.
逻辑说明:
一共监测5个节点,其中1个电信节点,1个联通节点,3个为机房节点;
部署于3个机房(兆维/东四/通管),当发现1个节点丢包超过20%,则记录到日志文件中/tmp/...txt;
当任意节点丢包超过50%,发送报警短信;
5个节点一共ping了50次,如果只有40次接收的包正常的话,则认为有问题,将收到包的数量记录到日志文件中.并发送报警短信;然后等待20分钟,再执行;
死循环执行,crontab中为:
56 15 * * * killall network_interrupt.sh;/root/admin/network_interrupt.sh;
以下为2017/08/03更新(脚本修改为二进制加壳文件后,原killall失效)
# network_interrupt test
02 14 * * * ps axf|grep "network_interrupt.sh"|grep -v "_ grep" |awk '{ print $1 }'|xargs kill -9;killall awk;/root/admin/network_interrupt.sh;
假设3个机房联通链路有问题,则其他2个机房会发送报警短信;
假设3个机房任意一个的电信有问题时:
兆维机房电信故障,兆维监控到外部一个电信节点丢包严重,则会通过报警短信报警(报警短信走的是联通链路).
东四机房同样;
通管机房则不同,通管机房电信故障时,发送短信会默认用电信的出口发送,但电信本身是断开的,故理论上收不到通管机房电信链路的故障短信.除非通管机房做了链路处理,在电信故障时,所有流量通过联通进行发送.
注意下此程序中的ping调用方式,和标准的有区别:
echo |awk 'BEGIN{system("ping 114.112.69.113 -c 2")} {"ping www.baidu.com"} END {"ping www.google.com"}'
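这行只会执行第一个ping:后两个"ping ..."只是字符串常量表达式,既没有通过system()调用,也没有接管道,所以不会被执行。下面的程序之所以能执行,是因为使用了awk的 "命令" | getline 变量 语法:awk会通过shell运行该命令,并把输出的第一行读入变量,因此不需要system()。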
#!/bin/bash
# Network connect or disconnect test current
# Chenxin 20130118
# update:20130205
#
# Purpose: oldest version of the monitor. It endlessly pings five reference
# addresses with 10 packets each, logs degraded targets to
# /tmp/network_interrupt/ping_error_address.txt and sends SMS alarms. After an
# overall alarm the check pauses for 20 minutes before running again.
#
# Requires: ./send_sms.sh in the working directory (/root/admin/).

# Load the distribution init helper library when one is present.
if [ -f /etc/init.d/functions ]; then
  . /etc/init.d/functions
elif [ -f /lib/lsb/init-functions ]; then
  . /lib/lsb/init-functions
fi
export PATH=$PATH:/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin

# send_sms.sh is called with a relative path, so the working directory matters.
cd /root/admin/ \
  || printf '%s\n' "warning: cannot cd to /root/admin/, ./send_sms.sh will not be found" >&2

# Reference list of public carrier addresses, kept as a no-op here-document.
: <<'ZHUSHI'
telecom
202.102.8.141
121.14.60.118
unicom
202.99.96.68
202.102.134.68
ZHUSHI

# Judge the process working;
# Count process-list lines mentioning this script (the count includes the
# grep itself) and refuse to start when too many copies are already running.
network_interrupt=$(ps axf | grep -c "network_interrupt.sh")
if (( network_interrupt >= 5 )); then
  echo "Too many same process is working... and then exit!!!!"
  exit
fi

# get local ip address
# First non-private "inet addr:" field printed by ifconfig (value keeps its
# "addr:" prefix). Only the first match is used so the value stays one word.
ip_address=$(ifconfig | awk '$0~/inet addr:/&&$0!~/addr:192.168|addr:127.0.0|addr:10\./{print $2; exit}')

mkdir -p /tmp/network_interrupt/

# tmp variables
# NOTE(review): both files are truncated here and handed to awk, but the awk
# program below never reads or writes them.
result_tmp="/tmp/result_tmp.txt"
all_tmp="/tmp/all_tmp.txt"
echo >"$result_tmp"
echo >"$all_tmp"

while true; do
  # awk needs one input record to run its main rule, hence the "echo |".
  echo |
    awk -v ip_address="$ip_address" -v result_tmp="$result_tmp" -v all_tmp="$all_tmp" '
    BEGIN {
      log_file = "/tmp/network_interrupt/ping_error_address.txt"

      #Define telecom and unicom ip address
      telecom[0]="202.102.8.141"
      telecom[1]="202.99.96.68"
      #Define idc ip address
      telecom[2]="114.112.69.113"
      telecom[3]="121.101.210.1"
      telecom[4]="122.112.3.1"
    }
    {
      # result[i] = number of ping replies (0..10) received from telecom[i].
      for (i in telecom) {
        # "cmd" | getline var runs cmd through the shell and reads the first
        # output line into var (previously this was invoked through system()).
        cmd = "ping " telecom[i] " -c 10| grep ttl |wc -l"
        cmd | getline result[i]
        close(cmd)

        # 8 or fewer replies out of 10 (20% loss or more): log the target.
        if (result[i] <= 8) {
          system("echo $(date +%Y%m%d-%H%M%S) " ip_address " to " telecom[i] " disconnect get " result[i] "/10 >> " log_file)
        }
      }
    }
    END {
      # Per-target alarm: 5 or fewer replies out of 10 (50% loss or more).
      for (i in result) {
        if (result[i] <= 5) {
          system("./send_sms.sh " ip_address " to " telecom[i] " get" result[i] "/10 ")
        }
      }

      # Total replies over all targets: 5 targets * 10 packets = 50.
      for (i in result) {
        end_result += result[i]
      }

      # 40 or fewer replies out of 50: overall alarm, then back off for
      # 20 minutes so the alarm is not repeated on every cycle.
      if (end_result <= 40) {
        system("./send_sms.sh " ip_address " get " end_result "/50 packets ")
        system("echo get " end_result "/50 data packets >> " log_file)
        system("echo ---------- >> " log_file)
        system("sleep 1200")
        exit  # leave awk; the trailing sleep is skipped
      }
      system("sleep 12")
    }
    '
  sleep 10
done

浙公网安备 33010602011771号