#!/bin/bash
#
# lnet	This shell script takes care of starting and stopping
#       the lnet (Lustre networking) services.
#
# chkconfig: - 59 76
# description:  Part of the lustre file system.
# probe: true
# config: /etc/sysconfig/lnet

# Source the init-script function library if it is present.
if [ -f /etc/rc.d/init.d/functions ]; then
	. /etc/rc.d/init.d/functions
fi

# Source the system networking configuration.  When it loads cleanly and
# says networking is disabled, there is nothing for this script to do.
if [ -f /etc/sysconfig/network ] && . /etc/sysconfig/network; then
	if [ "${NETWORKING}" = "no" ]; then
		exit 0
	fi
fi

# Source the optional lnet configuration file.  It may define the
# PREEXEC_*/POSTEXEC_* hooks consulted further down.
if [ -f /etc/sysconfig/lnet ]; then
	. /etc/sysconfig/lnet
fi

# Modules handed to remove_modules first on "stop", before LNet itself
# is unconfigured.  They are processed in the order listed.
TOP_MODULES=(
	obdecho
	llite
	lustre
	osc
	lov
	mds
	mdc
	mgs
	mgc
	ost
	obdfilter
	lquota
	ptlrpc
)
readonly TOP_MODULES
# Modules handed to remove_modules on "stop" after LNet has been
# unconfigured.  They are processed in the order listed.
BOTTOM_MODULES=(
	ksocklnd
	kqswlnd
	ko2iblnd
	fsfilt_ldiskfs
	obdclass
	lnet
	lvfs
	libcfs
	ldiskfs
)
readonly BOTTOM_MODULES

# awk program used by remove_modules on the output of /sbin/lsmod.
# The caller passes the module to look for in the awk variable
# "module_name".  The program prints the third field of the first line
# whose first field equals module_name, or -1 when no line matches.
# remove_modules treats that value as the module's reference count.
declare -r awkprog='BEGIN { rc = -1 }
		       { if ( $1 == module_name ) { rc = $3; exit; } }
		    END { print rc }'

# Usage: run_preexec_check <action>
# Runs the optional pre-execution hooks for <action> (the callers in this
# script pass "start" or "stop").
#   PREEXEC_CHECK  - command line, deliberately word-split, that must
#                    succeed
#   PREEXEC_SCRIPT - program invoked with <action> as its only argument
# A hook that is unset or empty is skipped.  If a hook fails, a message is
# printed and the whole script terminates with exit status 1.
run_preexec_check ()
{
	local action="$1"

	if [ -n "$PREEXEC_CHECK" ]; then
		$PREEXEC_CHECK || {
			echo "Pre-exec check \"$PREEXEC_CHECK\" failed.  Aborting."
			exit 1
		}
	fi

	if [ -n "$PREEXEC_SCRIPT" ]; then
		"$PREEXEC_SCRIPT" "$action" || {
			echo "Pre-exec script \"$PREEXEC_SCRIPT\" failed.  Aborting."
			exit 1
		}
	fi

	return 0
}

# Usage: run_postexec_check <action>
# Runs the optional post-execution hooks for <action> (the callers in this
# script pass "start" or "stop").
#   POSTEXEC_CHECK  - command line, deliberately word-split, that must
#                     succeed
#   POSTEXEC_SCRIPT - program invoked with <action> as its only argument
# A hook that is unset or empty is skipped.  If a hook fails, a message is
# printed and the whole script terminates with exit status 1.
run_postexec_check ()
{
	local action="$1"

	if [ -n "$POSTEXEC_CHECK" ]; then
		$POSTEXEC_CHECK || {
			echo "Post-exec check \"$POSTEXEC_CHECK\" failed.  Aborting."
			exit 1
		}
	fi

	if [ -n "$POSTEXEC_SCRIPT" ]; then
		"$POSTEXEC_SCRIPT" "$action" || {
			echo "Post-exec script \"$POSTEXEC_SCRIPT\" failed.  Aborting."
			exit 1
		}
	fi

	return 0
}

# Usage: remove_modules <module>...
# Unload each named kernel module with /sbin/rmmod, in the order given.
#   - A module that /sbin/lsmod does not list is skipped.
#   - A module whose reference count is above zero gets a single 5 second
#     grace period and is then checked once more.
# The reference count is extracted from lsmod output with the file-level
# awk program in $awkprog.
# Returns 0 when every module was removed or was not loaded; returns 1 as
# soon as a module is still referenced or rmmod fails.
remove_modules ()
{
	# The arguments are joined and re-split on whitespace, so a single
	# space-separated list behaves like separate arguments.
	local modules="$*"
	# Keep the loop variable local so the caller's scope is not touched.
	local mod
	local ref_cnt

	for mod in $modules; do
		ref_cnt=$(/sbin/lsmod | awk "$awkprog" "module_name=$mod")
		if [ "$ref_cnt" -lt 0 ]; then
			# module not loaded, skip it
			continue
		fi
		if [ "$ref_cnt" -gt 0 ]; then
			# module in use.  maybe it just needs a few seconds
			# after removal of previous modules.
			sleep 5
			ref_cnt=$(/sbin/lsmod | awk "$awkprog" "module_name=$mod")
		fi
		if [ "$ref_cnt" -eq 0 ]; then
			# unload the module
			echo "Removing module $mod"
			if ! /sbin/rmmod "$mod"; then
				echo "ERROR: Failed to remove module $mod."
				return 1
			fi
		else
			# module still in use after the grace period.
			echo "ERROR: Module $mod has non-zero reference count."
			return 1
		fi
	done

	return 0
}

# Usage: stop_lnet
# Unconfigure LNet by running "/usr/sbin/lctl network unconfigure".
# Returns 0 when the command succeeds, and also when it fails with
# "LNET unconfigure error 19", which is treated as LNet already being
# unconfigured.  Any other failure prints lctl's output and returns 1.
stop_lnet ()
{
	local errmsg

	# Assign separately from the "local" declaration: with
	# "local errmsg=$(cmd)" the status seen afterwards is that of
	# "local" itself, which hides a failing lctl.
	if errmsg=$(/usr/sbin/lctl network unconfigure 2>&1); then
		return 0
	fi

	case "$errmsg" in
	*"LNET unconfigure error 19"*)
		# This error message means that lnet is already
		# unconfigured, and the modules are not loaded.
		return 0
		;;
	esac

	echo "$errmsg"
	return 1
}

# Usage: status
# Print the LNet state on stdout and return the matching status code.
# The result is also left in the globals STATE and RETVAL.
#   stopped   - lnet module not found in /proc/modules        (3)
#   loaded    - lnet module loaded but no routes reported      (3)
#   running   - /proc/sys/lnet/routes has content              (0)
#   unhealthy - health_check contains "NOT HEALTHY"            (1)
#   LBUG      - health_check contains "LBUG"                   (152)
# Later checks override earlier ones, so LBUG wins over unhealthy, which
# wins over running.
status ()
{
	local health="/proc/fs/lustre/health_check"

	STATE="stopped"
	# LSB compliance - return 3 if service is not running
	# Lustre-specific returns
	# 150 - partial startup
	# 151 - health_check unhealthy
	# 152 - LBUG
	RETVAL=3
	grep -q "lnet" /proc/modules && STATE="loaded"

	# check for any routes - on a portals router this is the only thing
	if [ -n "$(cat /proc/sys/lnet/routes 2> /dev/null)" ]; then
		STATE="running"
		RETVAL=0
	fi

	# check if this is a router
	if [ -d /proc/sys/lnet ]; then
		if head -n 1 /proc/sys/lnet/routes | grep -i -q "Routing enabled"; then
			STATE="running"
			RETVAL=0
		fi
	fi

	# check for error in health_check
	# NOTE(review): the table above lists 151 for this case but the code
	# has always used 1 - confirm which value is intended.
	if [ -f "$health" ] && grep -q "NOT HEALTHY" "$health"; then
		STATE="unhealthy"
		RETVAL=1
	fi

	# check for LBUG
	if [ -f "$health" ] && grep -q "LBUG" "$health"; then
		STATE="LBUG"
		RETVAL=152
	fi

	echo "$STATE"
	# Hand the computed code to the caller; falling off the end of the
	# function would always report success.
	return "$RETVAL"
}

# See how we were called.
# Dispatch on the requested action.  Every arm that does not exit on its
# own falls through to the final "exit 0".
case "$1" in
  start)
	run_preexec_check "start"
	touch /var/lock/subsys/lnet
	modprobe lnet || exit 1
	lctl network up || exit 1
	run_postexec_check "start"
	;;
  stop)
	run_preexec_check "stop"
	# Upper-layer modules go first, then LNet is unconfigured, then
	# the lower-layer modules are removed.
	remove_modules "${TOP_MODULES[@]}" || exit 1
	stop_lnet || exit 1
	remove_modules "${BOTTOM_MODULES[@]}" || exit 1
	rm -f /var/lock/subsys/lnet
	run_postexec_check "stop"
	;;
  status)
	status
	# status() records its LSB status code in RETVAL; pass it on so
	# callers see "not running" as a non-zero exit status.
	exit "${RETVAL:-0}"
	;;
  restart)
	# A failed stop is not fatal here, but a failed start is reported.
	"$0" stop
	"$0" start || exit $?
	;;
  reload)
	touch /var/lock/subsys/lnet
	;;
  probe)
	if [ ! -f /var/lock/subsys/lnet ] ; then
	  echo $"start"; exit 0
	fi
	;;
  condrestart)
	# Restart only when the lock file says the service was started.
	if [ -f /var/lock/subsys/lnet ]; then
		"$0" stop
		"$0" start || exit $?
	fi
	;;
  *)
	echo $"Usage: lnet {start|stop|status|restart|reload|condrestart}"
	exit 1
	;;
esac

exit 0
