version 2.17 with nifty status screen.
This commit is contained in:
parent
9e4a89e95f
commit
7ae3f7c512
170
ppss.sh
170
ppss.sh
@ -38,7 +38,7 @@ trap 'kill_process; ' INT
|
||||
|
||||
# Setting some vars. Do not change.
|
||||
SCRIPT_NAME="Distributed Parallel Processing Shell Script"
|
||||
SCRIPT_VERSION="2.14"
|
||||
SCRIPT_VERSION="2.17"
|
||||
|
||||
# The first argument to this script is always the 'mode'.
|
||||
MODE="$1"
|
||||
@ -68,11 +68,14 @@ PROCESSORS=""
|
||||
SSH_SERVER="" # Remote server or 'master'.
|
||||
SSH_KEY="" # SSH key for ssh account.
|
||||
SSH_KNOWN_HOSTS=""
|
||||
SSH_SOCKET="/tmp/PPSS-ssh-socket" # Multiplex multiple SSH connections over 1 master.
|
||||
SSH_SOCKET="./PPSS_SSH_SOCKET" # Multiplex multiple SSH connections over 1 master.
|
||||
SSH_OPTS="-o BatchMode=yes -o ControlPath=$SSH_SOCKET \
|
||||
-o GlobalKnownHostsFile=./known_hosts \
|
||||
-o ControlMaster=auto \
|
||||
-o Cipher=blowfish \
|
||||
-o ConnectTimeout=15 "
|
||||
|
||||
# Blowfish is faster but still secure.
|
||||
SSH_MASTER_PID=""
|
||||
|
||||
PPSS_HOME_DIR="ppss"
|
||||
@ -84,7 +87,7 @@ SECURE_COPY="1" # If set, use SCP, Otherwise, use cp.
|
||||
REMOTE_OUTPUT_DIR="" # Remote directory to which output must be uploaded.
|
||||
SCRIPT="" # Custom user script that is executed by ppss.
|
||||
ITEM_ESCAPED=""
|
||||
|
||||
NODE_STATUS="status.txt"
|
||||
|
||||
showusage () {
|
||||
|
||||
@ -174,7 +177,7 @@ showusage () {
|
||||
echo
|
||||
echo -e "Running PPSS based on a configuration file."
|
||||
echo
|
||||
echo -e "$0 node -C config.cfg"
|
||||
echo -e "$0 standalone -C config.cfg"
|
||||
echo
|
||||
echo -e "Running PPSS on a client as part of a cluster."
|
||||
echo
|
||||
@ -215,6 +218,7 @@ kill_process () {
|
||||
|
||||
exec_cmd () {
|
||||
|
||||
|
||||
CMD="$1"
|
||||
|
||||
if [ ! -z "$SSH_SERVER" ] && [ "$SECURE_COPY" == "1" ]
|
||||
@ -243,16 +247,21 @@ check_for_interrupt () {
|
||||
does_file_exist "$STOP_SIGNAL"
|
||||
if [ "$?" == "0" ]
|
||||
then
|
||||
set_status "STOPPED"
|
||||
log INFO "STOPPING job. Stop signal found."
|
||||
STOP="1"
|
||||
return 1
|
||||
fi
|
||||
|
||||
does_file_exist "$PAUSE_SIGNAL"
|
||||
if [ "$?" == "0" ]
|
||||
then
|
||||
set_status "PAUZED"
|
||||
log INFO "PAUSE: sleeping for $PAUSE_DELAY SECONDS."
|
||||
sleep $PAUSE_DELAY
|
||||
check_for_interrupt
|
||||
else
|
||||
set_status "RUNNING"
|
||||
fi
|
||||
}
|
||||
|
||||
@ -476,10 +485,12 @@ done
|
||||
|
||||
display_header () {
|
||||
|
||||
log INFO "---------------------------------------------------------"
|
||||
log INFO "========================================================="
|
||||
log INFO " |P|P|S|S| "
|
||||
log INFO "$SCRIPT_NAME version $SCRIPT_VERSION"
|
||||
log INDO "---------------------------------------------------------"
|
||||
log INFO "Hostname: $HOSTNAME"
|
||||
log INFO "========================================================="
|
||||
log INFO "Hostname:\t$HOSTNAME"
|
||||
log INFO "---------------------------------------------------------"
|
||||
}
|
||||
|
||||
|
||||
@ -516,6 +527,8 @@ init_vars () {
|
||||
|
||||
touch $RUNNING_SIGNAL
|
||||
|
||||
set_status "RUNNING"
|
||||
|
||||
if [ -z "$MAX_NO_OF_RUNNING_JOBS" ]
|
||||
then
|
||||
get_no_of_cpus $HYPERTHREADING
|
||||
@ -569,6 +582,19 @@ init_vars () {
|
||||
fi
|
||||
}
|
||||
|
||||
get_status () {
|
||||
|
||||
STATUS=`cat "$NODE_SATUS"`
|
||||
echo "$STATUS"
|
||||
}
|
||||
|
||||
set_status () {
|
||||
|
||||
STATUS="$1"
|
||||
echo "$HOSTNAME $STATUS" > "$NODE_STATUS"
|
||||
}
|
||||
|
||||
|
||||
expand_str () {
|
||||
|
||||
STR=$1
|
||||
@ -587,13 +613,12 @@ log () {
|
||||
|
||||
TYPE="$1"
|
||||
MESG="$2"
|
||||
TMP_LOG=""
|
||||
TYPE_LENGTH=6
|
||||
TYPE_LENGTH=5
|
||||
|
||||
TYPE_EXP=`expand_str "$TYPE"`
|
||||
|
||||
DATE=`date +%b\ %d\ %H:%M:%S`
|
||||
PREFIX="$DATE: ${TYPE_EXP:0:$TYPE_LENGTH} -"
|
||||
PREFIX="$DATE: ${TYPE_EXP:0:$TYPE_LENGTH}"
|
||||
|
||||
LOG_MSG="$PREFIX $MESG"
|
||||
|
||||
@ -623,6 +648,7 @@ check_status () {
|
||||
|
||||
erase_ppss () {
|
||||
|
||||
|
||||
echo "Are you realy sure you want to erase PPSS from all nades!? (YES/NO)"
|
||||
read YN
|
||||
|
||||
@ -631,8 +657,8 @@ erase_ppss () {
|
||||
for NODE in `cat $NODES_FILE`
|
||||
do
|
||||
log INFO "Erasing PPSS homedir $PPSS_HOME_DIR from node $NODE."
|
||||
ssh -q $SSH_KEY $USER@$NODE "./$PPSS_HOME_DIR/$0 kill"
|
||||
ssh -q $SSH_KEY $USER@$NODE "rm -rf $PPSS_HOME_DIR"
|
||||
ssh -q $SSH_KEY $SSH_OPTS $USER@$NODE "./$PPSS_HOME_DIR/$0 kill"
|
||||
ssh -q $SSH_KEY $SSH_OPTS $USER@$NODE "rm -rf $PPSS_HOME_DIR"
|
||||
done
|
||||
else
|
||||
log INFO "Aborting.."
|
||||
@ -644,35 +670,54 @@ deploy () {
|
||||
|
||||
NODE="$1"
|
||||
|
||||
SSH_OPTS_NODE="-o BatchMode=yes -o ControlPath=socket-%h \
|
||||
-o GlobalKnownHostsFile=./known_hosts \
|
||||
-o ControlMaster=auto \
|
||||
-o Cipher=blowfish \
|
||||
-o ConnectTimeout=5 "
|
||||
|
||||
ERROR=0
|
||||
set_error () {
|
||||
|
||||
if [ ! "$1" == "0" ]
|
||||
then
|
||||
ERROR=$1
|
||||
ERROR=1
|
||||
fi
|
||||
}
|
||||
|
||||
ssh -q -o ConnectTimeout=5 $SSH_KEY $USER@$NODE exit 0
|
||||
set_error "$?"
|
||||
if [ ! "$ERROR" == "0" ]
|
||||
then
|
||||
log INFO "Cannot connect to node $NODE."
|
||||
return
|
||||
fi
|
||||
|
||||
ssh -N -M $SSH_OPTS_NODE $SSH_KEY $USER@$NODE &
|
||||
SSH_PID=$!
|
||||
|
||||
KEY=`echo $SSH_KEY | cut -d " " -f 2`
|
||||
|
||||
ssh -q $SSH_OPTS $SSH_KEY $USER@$NODE "mkdir $PPSS_HOME_DIR >> /dev/null 2>&1"
|
||||
scp -q $SSH_OPTS $SSH_KEY $0 $USER@$NODE:~/$PPSS_HOME_DIR
|
||||
sleep 1.1
|
||||
|
||||
ssh -q $SSH_OPTS_NODE $SSH_KEY $USER@$NODE "mkdir $PPSS_HOME_DIR >> /dev/null 2>&1"
|
||||
scp -q $SSH_OPTS_NODE $SSH_KEY $0 $USER@$NODE:~/$PPSS_HOME_DIR
|
||||
set_error $?
|
||||
scp -q $SSH_OPTS $SSH_KEY $KEY $USER@$NODE:~/$PPSS_HOME_DIR
|
||||
scp -q $SSH_OPTS_NODE $SSH_KEY $KEY $USER@$NODE:~/$PPSS_HOME_DIR
|
||||
set_error $?
|
||||
scp -q $SSH_OPTS $SSH_KEY $CONFIG $USER@$NODE:~/$PPSS_HOME_DIR
|
||||
scp -q $SSH_OPTS_NODE $SSH_KEY $CONFIG $USER@$NODE:~/$PPSS_HOME_DIR
|
||||
set_error $?
|
||||
scp -q $SSH_OPTS $SSH_KEY known_hosts $USER@$NODE:~/$PPSS_HOME_DIR
|
||||
scp -q $SSH_OPTS_NODE $SSH_KEY known_hosts $USER@$NODE:~/$PPSS_HOME_DIR
|
||||
set_error $?
|
||||
if [ ! -z "$SCRIPT" ]
|
||||
then
|
||||
scp -q $SSH_OPTS $SSH_KEY $SCRIPT $USER@$NODE:~/$PPSS_HOME_DIR
|
||||
scp -q $SSH_OPTS_NODE $SSH_KEY $SCRIPT $USER@$NODE:~/$PPSS_HOME_DIR
|
||||
set_error $?
|
||||
fi
|
||||
|
||||
if [ ! -z "$INPUT_FILE" ]
|
||||
then
|
||||
scp -q $SSH_OPTS $SSH_KEY $INPUT_FILE $USER@$NODE:~/$PPSS_HOME_DIR
|
||||
scp -q $SSH_OPTS_NODE $SSH_KEY $INPUT_FILE $USER@$NODE:~/$PPSS_HOME_DIR
|
||||
set_error $?
|
||||
fi
|
||||
|
||||
@ -682,6 +727,8 @@ deploy () {
|
||||
else
|
||||
log INFO "PPSS failed to install on $NODE."
|
||||
fi
|
||||
|
||||
kill $SSH_PID
|
||||
}
|
||||
|
||||
deploy_ppss () {
|
||||
@ -699,12 +746,14 @@ deploy_ppss () {
|
||||
then
|
||||
log INFO "ERROR - nodes require a key file."
|
||||
cleanup
|
||||
set_status "ERROR"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ ! -e "$SCRIPT" ] && [ ! -z "$SCRIPT" ]
|
||||
then
|
||||
log INFO "ERROR - script $SCRIPT not found."
|
||||
set_status "ERROR"
|
||||
cleanup
|
||||
exit 1
|
||||
fi
|
||||
@ -717,7 +766,7 @@ deploy_ppss () {
|
||||
else
|
||||
for NODE in `cat $NODES_FILE`
|
||||
do
|
||||
deploy "$NODE"
|
||||
deploy "$NODE" &
|
||||
sleep 0.1
|
||||
done
|
||||
fi
|
||||
@ -737,6 +786,7 @@ test_server () {
|
||||
# Testing if the remote server works as expected.
|
||||
if [ ! -z "$SSH_SERVER" ]
|
||||
then
|
||||
|
||||
exec_cmd "date >> /dev/null"
|
||||
check_status "$?" "$FUNCNAME" "Server $SSH_SERVER could not be reached"
|
||||
|
||||
@ -776,18 +826,23 @@ get_no_of_cpus () {
|
||||
then
|
||||
NUMBER=`sysctl -a hw | grep -w logicalcpu | awk '{ print $2 }'`
|
||||
got_cpu_info "$?"
|
||||
|
||||
elif [ "$ARCH" == "FreeBSD" ]
|
||||
then
|
||||
NUMBER=`sysctl hw.ncpu | awk '{ print $2 }'`
|
||||
got_cpu_info "$?"
|
||||
|
||||
else
|
||||
NUMBER=`grep ^processor $CPUINFO | wc -l`
|
||||
got_cpu_info "$?"
|
||||
|
||||
fi
|
||||
log INFO "Found $NUMBER logic processors."
|
||||
|
||||
elif [ "$HPT" == "no" ]
|
||||
then
|
||||
log INFO "Hyperthreading is disabled."
|
||||
|
||||
if [ "$ARCH" == "Linux" ]
|
||||
then
|
||||
PHYSICAL=`grep 'physical id' $CPUINFO`
|
||||
@ -800,6 +855,7 @@ get_no_of_cpus () {
|
||||
else
|
||||
log INFO "Found $PHYSICAL physical CPUs."
|
||||
fi
|
||||
|
||||
TMP=`grep 'core id' $CPUINFO`
|
||||
if [ "$?" == "0" ]
|
||||
then
|
||||
@ -836,6 +892,7 @@ get_no_of_cpus () {
|
||||
MAX_NO_OF_RUNNING_JOBS=$NUMBER
|
||||
else
|
||||
log INFO "$FUNCNAME ERROR - number of CPUs not obtained."
|
||||
set_status "ERROR"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
@ -1003,6 +1060,12 @@ lock_item () {
|
||||
log DEBUG "Trying to lock item $ITEM - $ITEM_LOCK_FILE."
|
||||
exec_cmd "mkdir $ITEM_LOCK_FILE >> /dev/null 2>&1"
|
||||
ERROR="$?"
|
||||
|
||||
if [ "$ERROR" == "$?" ]
|
||||
then
|
||||
exec_cmd "touch $ITEM_LOCK_FILE/$HOSTNAME" # Record that item is claimed by node x.
|
||||
fi
|
||||
|
||||
return "$ERROR"
|
||||
fi
|
||||
}
|
||||
@ -1035,6 +1098,7 @@ get_all_items () {
|
||||
if [ ! -e "$INPUT_FILE" ]
|
||||
then
|
||||
log INFO "ERROR - input file $INPUT_FILE does not exist."
|
||||
set_status "ERROR"
|
||||
cleanup
|
||||
exit 1
|
||||
fi
|
||||
@ -1292,6 +1356,19 @@ start_all_workers () {
|
||||
start_single_worker
|
||||
((i++))
|
||||
done
|
||||
|
||||
}
|
||||
|
||||
get_status_of_node () {
|
||||
|
||||
NODE="$1"
|
||||
STATUS=`ssh -o ConnectTimeout=10 $SSH_KEY $USER@$NODE cat "$PPSS_HOME_DIR/$NODE_STATUS"`
|
||||
ERROR="$?"
|
||||
if [ ! "$ERROR" == "0" ]
|
||||
then
|
||||
STATUS="Could not connect to $NODE."
|
||||
fi
|
||||
echo "$STATUS"
|
||||
}
|
||||
|
||||
show_status () {
|
||||
@ -1310,10 +1387,35 @@ show_status () {
|
||||
fi
|
||||
|
||||
PROCESSED=`exec_cmd "ls -1 $ITEM_LOCK_DIR | wc -l"` 2>&1 >> /dev/null
|
||||
STATUS=$((100 * $PROCESSED / $ITEMS))
|
||||
TMP_STATUS=$((100 * $PROCESSED / $ITEMS))
|
||||
|
||||
log INFO "$STATUS percent complete."
|
||||
log INFO "Status:\t\t$TMP_STATUS percent complete."
|
||||
|
||||
if [ ! -z $NODES_FILE ]
|
||||
then
|
||||
TMP_NO=`cat $NODES_FILE | wc -l`
|
||||
log INFO "Nodes:\t $TMP_NO"
|
||||
fi
|
||||
|
||||
|
||||
log INFO "---------------------------------------------------------"
|
||||
HEADER=`echo IP-address Hostname Processed Status | awk '{ printf ("%-16s %-18s % 10s %10s\n",$1,$2,$3,$4) }'`
|
||||
log INFO "$HEADER"
|
||||
log INFO "---------------------------------------------------------"
|
||||
PROCESSED=0
|
||||
for x in `cat $NODES_FILE`
|
||||
do
|
||||
NODE=`get_status_of_node "$x" | awk '{ print $1 }'`
|
||||
RES=`exec_cmd "grep $NODE ~/$JOB_LOG_DIR/* | wc -l"`
|
||||
let PROCESSED=$PROCESSED+$RES
|
||||
STATUS=`get_status_of_node "$x" | awk '{ print $2 }'`
|
||||
LINE=`echo "$x $NODE $RES $STATUS" | awk '{ printf ("%-16s %-18s % 10s %10s\n",$1,$2,$3,$4) }'`
|
||||
log INFO "$LINE"
|
||||
|
||||
done
|
||||
log INFO "---------------------------------------------------------"
|
||||
LINE=`echo $PROCESSED | awk '{ printf ("Total processed: % 29s\n",$1) }'`
|
||||
log INFO "$LINE"
|
||||
}
|
||||
|
||||
|
||||
@ -1450,13 +1552,21 @@ do
|
||||
log DEBUG "Sleeping $INTERVAL seconds."
|
||||
sleep $INTERVAL
|
||||
else
|
||||
echo -en "\033[1B"
|
||||
log INFO "There are no more running jobs, so we must be finished."
|
||||
echo -en "\033[1B"
|
||||
log INFO "Killing listener and remainig processes."
|
||||
log INFO "Dying processes may display an error message."
|
||||
kill_process
|
||||
fi
|
||||
if [ "$STOP" == "1" ] || [ ! "$PERCENT" == "100" ]
|
||||
then
|
||||
set_status "STOPPED"
|
||||
elif [ "$PERCENT" == "100" ]
|
||||
then
|
||||
set_status "FINISHED"
|
||||
fi
|
||||
|
||||
echo -en "\033[1B"
|
||||
log INFO "There are no more running jobs, so we must be finished."
|
||||
echo -en "\033[1B"
|
||||
log INFO "Killing listener and remainig processes."
|
||||
log INFO "Dying processes may display an error message."
|
||||
kill_process
|
||||
fi
|
||||
done
|
||||
|
||||
# Exit after all processes have finished.
|
||||
|
Loading…
Reference in New Issue
Block a user