先上图\
#!/bin/bash
#author: QingFeng
#qq: 530035210
#blog: https://my.oschina.net/pwd/blog
#自动监控url是否可用,如不可用则重启应用,并做相应的报警策略。
#缺省的配置如下
logdir="/data/log/check"                   # directory that holds the log file
log="${logdir}/log.log"                    # full path of the log file
is_font=1                                  # print log lines on the terminal: 1 = yes, 0 = no
is_log=0                                   # append log lines to the log file: 1 = yes, 0 = no
key='data-camp'                            # keyword identifying the application's processes
exec_stop='/etc/init.d/data-camp stop'     # command that stops the application
exec_start='/etc/init.d/data-camp start'   # command that starts the application
# Print the current local time on stdout as "YYYY-MM-DD HH:MM:SS".
datef() {
  local fmt='+%Y-%m-%d %H:%M:%S'
  date "$fmt"
}
#######################################
# Write one timestamped log line.
# Globals:
#   is_log  (read) - 1 appends the line to $log
#   is_font (read) - 1 prints the line on stdout
#   logdir  (read) - created on demand when logging to a file
#   log     (read) - path of the log file
# Arguments:
#   $1 - message text
# Outputs:
#   "[ <timestamp> ] <message>" to $log and/or stdout
#######################################
print_log() {
  local line
  # Take the timestamp once so the file and the terminal show the same time.
  line="[ $(datef) ] $1"

  if [[ "$is_log" -eq 1 ]]; then
    # Quoted so that a log path containing spaces or glob characters works.
    [[ -d "$logdir" ]] || mkdir -p -- "$logdir"
    echo "$line" >> "$log"
  fi

  if [[ "$is_font" -eq 1 ]]; then
    # -e is kept from the original: backslash escapes in the message are
    # interpreted on the terminal only, not in the log file.
    echo -e "$line"
  fi
}
#######################################
# Run a stop/start command, then record how many processes match a keyword.
# Arguments:
#   $1 - keyword (grep pattern) identifying the application's processes
#   $2 - command to run; word-split on purpose so "script action" works
#   $3 - optional extra arguments appended to the command
# Outputs:
#   Writes the matching-process count to /tmp/restart.num (the check_*
#   functions read it back) and logs it through print_log.
# Returns:
#   Exits the whole script with status 1 when $1 or $2 is empty.
#######################################
derestart() {
  if [[ -z "$1" ]]; then
    print_log "${FUNCNAME[0]}():应用关键字不能为空"
    exit 1
  fi
  if [[ -z "$2" ]]; then
    print_log "${FUNCNAME[0]}():启动文件不能为空"
    exit 1
  fi
  # $3 is optional: existing callers pass only the keyword and the command,
  # so it is not validated.

  local count

  # Intentionally unquoted: $2 carries "command arg ..." in a single string.
  # shellcheck disable=SC2086
  $2 $3

  # Count processes matching the keyword, excluding grep itself and this
  # script. $0 is matched as a fixed string and guarded with -- so a script
  # name with regex characters or a leading dash cannot break the filter.
  count=$(ps axu | grep -- "$1" | grep -v grep | grep -v -F -c -- "$0")
  echo "$count" > /tmp/restart.num
  print_log "${FUNCNAME[0]}(): $1的进程数为:$count"
}
#######################################
# Scenario 1: restart the application when the probe URL does not return
# HTTP 200.
# Globals:
#   key, exec_stop, exec_start (read) - process keyword and stop/start commands
# Arguments:
#   $1 - server address
#   $2 - server port
# Outputs:
#   Progress messages through print_log.
# Returns:
#   Exits the whole script with status 1 when $1 or $2 is empty.
#######################################
check_code() {
  if [[ -z "$1" ]]; then
    print_log "${FUNCNAME[0]}():服务器地址不能为空"
    exit 1
  fi
  if [[ -z "$2" ]]; then
    print_log "${FUNCNAME[0]}():服务器端口不能为空"
    exit 1
  fi

  local url="https://$1:$2/verdict/session/LSGJA52U7CH055974/latest/result"
  local code remaining running

  print_log "${FUNCNAME[0]}():开始检测-[$1:$2]服务器的网站状态返回码."
  code=$(curl -m 8 -o /dev/null -s -w '%{http_code}' "$url")

  # String comparison: an empty or malformed code counts as "not 200"
  # instead of raising an arithmetic error.
  if [[ "$code" != "200" ]]; then
    print_log "${FUNCNAME[0]}():[$1:$2]服务器的网站状态返回码不正常,开始重启应用--$code."

    print_log "${FUNCNAME[0]}():执行命令: $exec_stop"
    derestart "$key" "$exec_stop"
    # derestart leaves the remaining process count in /tmp/restart.num.
    remaining=$(cat /tmp/restart.num)
    if [[ "$remaining" -ne 0 ]]; then
      # Best effort: a failed stop is logged, but the start is still attempted.
      print_log "${FUNCNAME[0]}():停应用失败."
    fi

    print_log "${FUNCNAME[0]}():执行命令: $exec_start"
    sleep 3
    derestart "$key" "$exec_start"
    running=$(cat /tmp/restart.num)
    if [[ "$running" -eq 0 ]]; then
      print_log "${FUNCNAME[0]}():启动应用失败."
    else
      # Success is reported only when a process is running after the start.
      print_log "${FUNCNAME[0]}():重启应用成功."
    fi
  else
    print_log "${FUNCNAME[0]}():[$1:$2]服务器的网站状态返回码正常--$code."
  fi
}
#######################################
# Scenario 2: restart the application when the probe URL takes longer than
# 60 seconds to answer.
# Globals:
#   key, exec_stop, exec_start (read) - process keyword and stop/start commands
# Arguments:
#   $1 - server address
#   $2 - server port
# Outputs:
#   Progress messages through print_log.
# Returns:
#   Exits the whole script with status 1 when $1 or $2 is empty.
#######################################
check_timeout() {
  if [[ -z "$1" ]]; then
    print_log "${FUNCNAME[0]}():服务器地址不能为空"
    exit 1
  fi
  if [[ -z "$2" ]]; then
    print_log "${FUNCNAME[0]}():服务器端口不能为空"
    exit 1
  fi

  local url="https://$1:$2/verdict/session/LSGJA52U7CH055974/latest/result"
  local -r threshold_ms=60000
  local elapsed_ms elapsed_s remaining running

  print_log "${FUNCNAME[0]}():开始检测-[$1:$2]服务器的网站超时时间."

  # curl reports time_total in fractional seconds. Convert it to whole
  # milliseconds, because bash integer tests reject values like "61234.6".
  # --max-time sits a little above the threshold so a hung server cannot
  # block the monitor forever, yet still registers as "too slow".
  elapsed_ms=$(
    curl -m "$(( threshold_ms / 1000 + 10 ))" -o /dev/null -s \
      -w 'time_total: %{time_total}\n' "$url" \
      | awk -F ':' '/time_total/ { printf "%d", $2 * 1000; exit }'
  )
  # No output at all (e.g. curl missing) is treated as 0 ms.
  elapsed_ms=${elapsed_ms:-0}
  elapsed_s=$(( elapsed_ms / 1000 ))

  if (( elapsed_ms > threshold_ms )); then
    print_log "${FUNCNAME[0]}():[$1:$2]服务器的网站响应时间不正常,开始重启应用--$elapsed_ms ms."

    print_log "${FUNCNAME[0]}():执行命令: $exec_stop"
    derestart "$key" "$exec_stop"
    # derestart leaves the remaining process count in /tmp/restart.num.
    remaining=$(cat /tmp/restart.num)
    if [[ "$remaining" -ne 0 ]]; then
      # Best effort: a failed stop is logged, but the start is still attempted.
      print_log "${FUNCNAME[0]}():停应用失败."
    fi

    print_log "${FUNCNAME[0]}():执行命令: $exec_start"
    sleep 3
    derestart "$key" "$exec_start"
    running=$(cat /tmp/restart.num)
    if [[ "$running" -eq 0 ]]; then
      print_log "${FUNCNAME[0]}():启动应用失败."
    else
      # Success is reported only when a process is running after the start.
      print_log "${FUNCNAME[0]}():重启应用成功."
    fi
  else
    print_log "${FUNCNAME[0]}():[$1:$2]服务器的网站响应时间正常--$elapsed_ms ms/$elapsed_s s."
  fi
}
# Run both probes (status code, then response time) against the local endpoint.
for probe in check_code check_timeout; do
  "$probe" "localhost" "6500"
done