二进制文件进程守护脚本

start.sh守护脚本

进程守护脚本

  • 使用方法
[ccodsupport@harbor DelRecord]$ ./start.sh  -h
Usage: start.sh [options] service1 [,service2..]
Start/stop services or show files version
  -c	Get coredump, used with -k.
  -d	Run a daemon script for service
  -f	Force kill service, used with -k
  -h	Print help infomation
  -k	Kill service
  -o	Get owner authority [ -o user ]
  -r	Restart service 
  -s	Show service status
  -v	Show file version
  list  Show all services status
  Note: If no option ,default to start a service. 

Valid shortcut for services:
  del 
  
Example: 
  "start.sh list"	View status of all services.
  "start.sh del"	 Start DelRecord.
  "start.sh -d del"  Start DelRecord with a daemon.
  "start.sh del " Start DelRecord.
  "start.sh -v del"  Show DelRecord version.
  "start.sh -s del"  Show DelRecord status.
  "start.sh -k del"  Stop DelRecord.
  "start.sh -r del"  Restart DelRecord.
  "start.sh -kd del" Stop DelRecord and its daemon.
  "start.sh -kf del" Force,kill DelRecord immediately.
  "start.sh all"	 Start all services at once.
  "start.sh -k all"  Stop all services at once.
  
[devops@my-dev DelRecord]$ ./start.sh  -kdf  del
stoping DelRecord ...                               [stopped]

[devops@my-dev DelRecord]$ ./start.sh   del
starting DelRecord ....                              [ok]

[devops@my-dev DelRecord]$ ./start.sh  list
DelRecord (pid 22081 22084)                         [running]
checkmen                                            [stopped]
  • 脚本源码
#!/bin/bash

#########################################################################################
################				 service info ###########################################
#########################################################################################
# Service table. The same index in ALIAS/PROC/DIR/ARGV describes one service;
# to manage another binary, append one more entry to each array.

# Short alias typed on the command line.
ALIAS[0]="del";
ALIAS[1]="men";

# Executable name (also the name matched in the ps output).
PROC[0]="DelRecord";
PROC[1]="checkmen";

# Directory that contains the executable.
DIR[0]="./";
DIR[1]="./bin";

# Command-line arguments passed to the executable.
ARGV[0]="";
ARGV[1]="--config ../cfg/checkmen.cfg";  # configuration file used by checkmen


# Number of managed services. Derived from the table so that every configured
# entry is reachable by "list", "all" and alias lookup.
PROC_COUNT=${#PROC[@]}
# Seconds the daemon loop sleeps between two start attempts.
CHECK_TIME=10

# File that receives the "check" trace lines written by start_proc.
#CHECK_LOGFILE="check.log"
CHECK_LOGFILE="/dev/null"

#########################################################################################

##################################################################################

# Print the command-line help text to stdout.
# Every argument below is emitted as one output line.
usage()
{
	printf '%s\n' \
		'Usage: start.sh [options] service1 [,service2..]' \
		'Start/stop services or show files version' \
		'  -c	Get coredump, used with -k.' \
		'  -d	Run a daemon script for service' \
		'  -f	Force kill service, used with -k' \
		'  -h	Print help infomation' \
		'  -k	Kill service' \
		'  -o	Get owner authority [ -o user ]' \
		'  -r	Restart service ' \
		'  -s	Show service status' \
		'  -v	Show file version' \
		'  list  Show all services status' \
		'  Note: If no option ,default to start a service. ' \
		'' \
		'Valid shortcut for services:' \
		'  del ' \
		'  ' \
		'Example: ' \
		'  "start.sh list"	View status of all services.' \
		'  "start.sh del"	 Start DelRecord.' \
		'  "start.sh -d del"  Start DelRecord with a daemon.' \
		'  "start.sh del " Start DelRecord.' \
		'  "start.sh -v del"  Show DelRecord version.' \
		'  "start.sh -s del"  Show DelRecord status.' \
		'  "start.sh -k del"  Stop DelRecord.' \
		'  "start.sh -r del"  Restart DelRecord.' \
		'  "start.sh -kd del" Stop DelRecord and its daemon.' \
		'  "start.sh -kf del" Force,kill DelRecord immediately.' \
		'  "start.sh all"	 Start all services at once.' \
		'  "start.sh -k all"  Stop all services at once.'
}

# Print a status tag right-aligned so that it ends at column 60.
#   $1 - number of columns already used on the current line
#   $2 - text to print, followed by a newline
echo_w()
{
	local pad=$(( 60 - $1 ))
	printf "%${pad}s\n" "$2"
}
# Enable core dumps for processes started from this shell, but only when they
# are currently disabled (core size limit is 0). A limit that was configured
# to any other value is left untouched.
set_ulimit()
{
	local core_limit
	core_limit=$(ulimit -c)
	# The operands must be separate words: "$x=0" written as one word is a
	# non-empty string and therefore always tests true.
	if [ "$core_limit" = "0" ]; then
		ulimit -c unlimited
	fi
}

# Look for the daemon loop ("$SCRIPT_NAME -a -d <alias>") among the processes
# of user $WHO.
#   $1 - service alias
# Prints the matching PIDs on one line.
# Returns 0 when no daemon was found and 1 when at least one was found.
checkDaemonRunning()
{
	local daemon_pids
	daemon_pids=$(ps -u $WHO -o pid -o comm -o cmd \
		| grep "$SCRIPT_NAME -a -d $1" \
		| grep -v grep \
		| awk '{print $1}')
	# Left unquoted on purpose: several PIDs are joined with single spaces.
	echo $daemon_pids
	[ -z "$daemon_pids" ] && return 0
	return 1
}

# Look for a process of user $WHO whose ps line contains the given name as a
# whole word.
#   $1 - process name
# Prints the matching PIDs on one line.
# Returns 0 when nothing was found and 1 when at least one process was found.
checkRunning()
{
	local found_pids
	found_pids=$(ps -u $WHO -o pid -o comm | grep -w $1 | awk '{print $1}')
	# Left unquoted on purpose: several PIDs are joined with single spaces.
	echo $found_pids

	if [ -n "$found_pids" ]; then
		return 1
	fi
	return 0
}


# Print one status line for a service: its name, its PIDs when running, and a
# right-aligned [stopped] / [running] / [daemon][running] tag.
#   $1 - index into the service table
show_status()
{
	local svc_name=${PROC[$1]}
	local svc_pids daemon_pids banner

	svc_pids=$(checkRunning $svc_name)
	# checkRunning returns 0 when the process was NOT found.
	if [ $? -eq 0 ]; then
		echo -n $svc_name
		echo_w $(( ${#svc_name} - 10 )) "[stopped]"
		return
	fi

	banner="$svc_name (pid $svc_pids)"
	echo -n $banner
	daemon_pids=$(checkDaemonRunning ${ALIAS[$1]})
	# checkDaemonRunning returns 0 when no daemon was found.
	if [ $? -eq 0 ]; then
		echo_w $(( ${#banner} - 10 )) "[running]"
	else
		echo_w $(( ${#banner} - 10 )) "[daemon][running]"
	fi
}

# Restart a service: stop it, then start it again.
#   $1 - index into the service table
# The start is attempted whatever kill_proc returned.
restart_proc()
{
	local svc_index=$1
	kill_proc $svc_index
	start_proc $svc_index
}

# Stop the daemon loop that supervises a service, if one is running.
#   $1 - index into the service table
# Does nothing (and prints nothing) when no daemon is found.
kill_daemon()
{
	local svc_name=${PROC[$1]}
	local label="$svc_name daemon"
	local daemon_pids

	daemon_pids=$(checkDaemonRunning ${ALIAS[$1]})
	# checkDaemonRunning returns 0 when no daemon was found.
	if [ $? -eq 0 ]; then
		return 0
	fi

	echo -n "stoping $svc_name daemon ."
	# All PIDs are signalled by one kill call.
	kill -9 $daemon_pids
	echo_w ${#label} "[stopped]"
}
# Stop a service and wait until it has disappeared from the process list.
#   $1 - index into the service table
# Globals read:    DAEMON (also stop the daemon loop), FORCE, CORE, PROC
# Globals written: KILLFAILED is set to true when the process is still
#                  running after the waiting period.
# Signal selection:
#   FORCE=true -> SIGKILL (9), immediate kill
#   CORE=true  -> SIGABRT (6), as selected by -c to get a core dump
#   otherwise  -> SIGTERM (15), so the service can shut down cleanly; this is
#                 what makes "-f" and the "use -kf" hint meaningful
# Returns 0 when the process was stopped by this call, 1 otherwise. As in the
# previous implementation, 1 is also returned when it was already stopped.
kill_proc()
{
	local result=1
	local proc pids len sig t

	[ "$DAEMON" = "true" ] && kill_daemon $1
	proc=${PROC[$1]}
	echo -n "stoping $proc ."
	len=${#proc}

	pids=$(checkRunning $proc)
	# checkRunning returns 0 when the process was NOT found.
	if [ $? -eq 0 ]; then
		echo -n ".."
		echo_w $((len + 2)) "[stopped]"
		return $result
	fi

	if [ "$FORCE" = "true" ]; then
		sig=9
	elif [ "$CORE" = "true" ]; then
		sig=6
	else
		sig=15
	fi
	# $pids is left unquoted so that every PID becomes its own argument.
	kill -$sig $pids

	# Poll once per second, printing one dot per check.
	for ((t = 0; t < 10; t++))
	do
		echo -n "."
		len=$((len + 1))
		if checkRunning $proc >/dev/null; then
			echo_w $len "[stopped]"
			return 0
		fi
		sleep 1
	done

	# Still alive: report it once and let the caller print the "-kf" hint.
	echo_w $len "[running]"
	KILLFAILED=true
	return $result
}

# Launch the daemon loop for a service in the background by re-running this
# script with "-a -d <alias>".
#   $1 - index into the service table
# No check is made for a daemon that is already running; the original guard
# based on checkDaemonRunning was disabled by its author.
start_daemon()
{
	local svc_alias=${ALIAS[$1]}
	local svc_name=${PROC[$1]}

	$SCRIPT_NAME -a -d $svc_alias >/dev/null 2>&1 &
	echo  "starting daemon for ${svc_name}... OK"
}

# Daemon loop: try to start the service every CHECK_TIME seconds, forever.
#   $1 - index into the service table
# start_proc returns 1 when the service is already running (nothing is
# logged), 2 when it was started and 3 when the start failed; the last two
# outcomes are appended to daemon.log in the current directory.
start_real_daemon()
{
	local rc
	while true
	do
		start_proc $1
		# Save the status at once: every later command, including a
		# "[ ... ]" test, overwrites $?.
		rc=$?
		case $rc in
		2)
			echo "$(date): start daemon ${PROC[$1]} ok" >> daemon.log
			;;
		3)
			echo "$(date): start daemon ${PROC[$1]} failed" >> daemon.log
			;;
		esac
		sleep $CHECK_TIME
	done
}

# Wait until a new line containing "HeartBeat succeeds" shows up in
# ../log/dcs/dcs.log, then launch ./StatSchedule in the background.
# The log is polled every 3 seconds; one dot is printed per poll.
start_ss()
{
	local dcs_log="../log/dcs/dcs.log"
	local from_line to_line

	from_line=$(wc -l $dcs_log | awk '{print $1}')

	while :
	do
		to_line=$(wc -l $dcs_log | awk '{print $1}')
		if [ "$to_line" != "$from_line" ]; then
			# Only the lines appended since the previous poll are scanned.
			if sed -n "$from_line,$to_line p" $dcs_log \
				| grep 'HeartBeat succeeds' >/dev/null
			then
				nohup ./StatSchedule ../cfg/ss_config.cfg >/dev/null 2>&1 &
				break
			fi
			from_line=$to_line
		fi
		echo -ne "."
		sleep 3
	done
}

# Start a service unless it is already running, then verify that it came up.
#   $1 - index into the service table
# Globals read: PROC, DIR, ARGV, ISLIST, STARTSS, CHECK_LOGFILE
# Returns:
#   1 - an instance is already running, nothing was started
#   2 - the service was started and is running
#   3 - the start failed (service directory not accessible, or the process
#       is not running about three seconds after the launch)
start_proc()
{
	local proc=${PROC[$1]}
	local len=${#proc}
	local ret cret t

	echo -n "starting $proc "
	ret=$(checkRunning $proc)
	cret=$?
	if [ "$ISLIST" != "true" ]; then
		echo "$(date) check $proc return=[$ret] [$cret]" >> "$CHECK_LOGFILE"
	fi

	# checkRunning returns non-zero (and prints the PIDs) when it found one.
	if [ $cret -ne 0 ] || [ -n "$ret" ]; then
		echo -n ".."
		echo_w $((len + 2)) "[FAILED]"
		echo "Error:$proc already have a instance (pid $ret)"
		return 1
	fi

	# Launch from the service directory inside a subshell: the caller's
	# working directory is never changed, and a failed cd can neither start
	# the binary from the wrong place nor jump to OLDPWD afterwards.
	(
		cd "${DIR[$1]}" || exit 1
		if [ "$STARTSS" = "true" ]; then
			start_ss
		else
			# ARGV is left unquoted so that it splits into separate arguments.
			nohup ./$proc ${ARGV[$1]} >/dev/null 2>&1 &
		fi
		exit 0
	)
	if [ $? -ne 0 ]; then
		echo -n ".."
		echo_w $((len + 2)) "[FAILED]"
		echo "Error:cannot enter directory [${DIR[$1]}] of $proc" >&2
		return 3
	fi

	# Give the process about three seconds to come up, one dot per second.
	for t in 1 2 3
	do
		echo -n "."
		len=$((len + 1))
		sleep 1
	done
	echo -n "."

	if checkRunning $proc >/dev/null; then
		# Status 0 means "not found": the process did not survive the start.
		echo_w $((len + 1)) "[FAILED]"
		return 3
	fi
	echo_w $((len + 1)) "[  OK  ]"
	return 2
}

# Print a banner and the output of "<service> --version".
#   $1 - index into the service table
# The binary is run from the directory configured in DIR[], the same one
# start_proc uses, inside a subshell so that the caller's working directory
# is never changed, even when the directory cannot be entered.
show_version()
{
	echo "====================== ${PROC[$1]} Version Info ======================"
	(
		cd "${DIR[$1]}" || exit 1
		./${PROC[$1]} --version
	)
}

# Run the action(s) selected by the option flags for one service.
#   $1 - index into the service table
# KILL and RESTART are exclusive: when either is set, only that action runs.
# Otherwise VERSION, STATUS, START and DAEMON are handled in that order and
# several of them may apply to the same call.
do_process()
{
	if [ "$KILL" = "true" ]; then
		kill_proc $1
		return
	elif [ "$RESTART" = "true" ]; then
		restart_proc $1
		return
	fi

	[ "$VERSION" = "true" ] && show_version $1
	[ "$STATUS" = "true" ] && show_status $1
	[ "$START" = "true" ] && start_proc $1

	if [ "$DAEMON" = "true" ]; then
		if [ "$EXPAND" == "true" ]; then
			# "-a -d": this process itself becomes the daemon loop.
			start_real_daemon $1
		else
			# "-d" alone: spawn the daemon loop in the background.
			start_daemon $1
		fi
	fi
}

##=========================================================================================================
##=========================================================================================================
##=
##=========================================================================================================
# Runtime environment for the managed binaries. The existing LD_LIBRARY_PATH
# is prepended only when it is non-empty: an empty value would otherwise
# leave a leading ":" (an empty entry), which the dynamic loader treats as
# the current directory.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}$(pwd)/lib:$(pwd)/oracle"
export TNS_ADMIN="$(pwd)/oracle"

# Action flags; the option parser switches them. START is the default action.
KILL=false;
VERSION=false;
STATUS=false;
START=true;
FORCE=false;
CORE=false;
DAEMON=false;
RESTART=false;
EXPAND=false;
# User whose processes are inspected; can be overridden with "-o user".
WHO=$(whoami);
# Owner of this script file ("$0" is quoted so paths with spaces work).
OWN=$(stat -c %U "$0")
# Set by kill_proc when a process is still alive after the waiting period.
KILLFAILED=false;
# Path used to re-invoke this script as the daemon loop.
SCRIPT_NAME=$0
# Set to true while handling the "list" pseudo-service.
ISLIST=false;
# Set by "-p": start_proc then launches through start_ss.
STARTSS=false;


# Parse the command-line options into the action flags.
# The leading ":" in the option string selects silent error reporting, so
# unknown options arrive as "?" and a missing argument arrives as ":".
# Every option that selects an action also clears the default START flag.
while getopts :krvsahfcdpo: OPTION
do
	case $OPTION in
	a)
		# Internal: with -d, run the daemon loop in this process.
		EXPAND=true;
		START=false;;
	k)
		KILL=true;
		START=false;;
	v)
		START=false;
		VERSION=true;;
	s)
		START=false;
		STATUS=true;;
	f)
		FORCE=true;;
	r)
		START=false;
		RESTART=true;;
	c)
		START=false;
		CORE=true;;
	o)
		WHO=$OPTARG;;
	d)
		START=false;
		DAEMON=true;;
	p)
		# Not listed in the help text: launch through start_ss.
		STARTSS=true;;
	h)
		usage;
		exit 0;;
	:)
		# Currently only "-o" takes an argument.
		echo "start.sh: option -$OPTARG requires an argument"
		echo "Try \"start.sh -h\" for more infomation."
		exit 1;;
	\?)
		echo "start.sh: invalid option"
		echo "Try \"start.sh -h\" for more infomation."
		exit 1;;
	esac
done
	

# Drop the parsed options so that "$@" holds only the service names.
shift $((OPTIND - 1))

if [ "$#" -eq 0 ]; then
	echo "start.sh: missing operand."
	echo "Try \"start.sh -h\" for more infomation."
	exit 1
fi

# Only the owner of the script may manage the services, unless "-o" replaced
# WHO. The check is skipped when the owner could not be determined (OWN is
# empty), which is also how the unquoted comparison behaved before.
if [ -n "$OWN" ] && [ "$OWN" != "$WHO" ]; then
	echo "start.sh:sorry [$WHO], the owner is [$OWN]."
	echo "Add option \"-o $OWN\" to ignore this."
	echo "Try \"start.sh -h\" for more infomation."
	exit 1
fi

set_ulimit

# Each operand is "list", "all" or a service alias (case-insensitive).
for proc in "$@"
do
	num=-1
	proc=$(tr A-Z a-z <<< "$proc")

	if [ "$proc" = "list" ]; then
		ISLIST=true
		for ((i = 0; i < PROC_COUNT; i++))
		do
			show_status $i
		done
		exit 0
	fi

	if [ "$proc" = "all" ]; then
		for ((i = 0; i < PROC_COUNT; i++))
		do
			do_process $i
		done
		# Stop handling operands, but fall through to the KILLFAILED hint
		# below so that "-k all" also reports processes that survived.
		break
	fi

	# Translate the alias into its index in the service table.
	for ((i = 0; i < PROC_COUNT; i++))
	do
		if [ "$proc" = "${ALIAS[$i]}" ]; then
			num=$i
			break
		fi
	done

	if [ "$num" -ne -1 ]; then
		do_process $num
	else
		echo "start.sh: wrong service name [$proc]. "
		echo "Try \"start.sh -h\" for more infomation."
		exit 1
	fi
done

if [ "$KILLFAILED" = "true" ]; then
	echo "  ----"
	echo "If they are still running , check it later use command \"start.sh list\""
	echo "Also can use \"start.sh -kf SERVICES\" to kill them immediately"
fi

exit 0
posted @ 2023-09-06 16:53  平凡的运维之路  阅读(32)  评论(0)    收藏  举报  来源