From 1da1de4b18ae78d314e865caf59040da66feebc8 Mon Sep 17 00:00:00 2001 From: Louwrentius Date: Sun, 20 Sep 2009 09:04:23 +0000 Subject: [PATCH] Improved distributed ppss --- ppss.sh | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/ppss.sh b/ppss.sh index f64df39..ab471ad 100755 --- a/ppss.sh +++ b/ppss.sh @@ -38,7 +38,7 @@ trap 'kill_process; ' INT # Setting some vars. Do not change. SCRIPT_NAME="Distributed Parallel Processing Shell Script" -SCRIPT_VERSION="2.20" +SCRIPT_VERSION="2.22" # The first argument to this script is always the 'mode'. MODE="$1" @@ -48,7 +48,7 @@ shift # export PPSS_DIR=/path/to/workingdir if [ -z "$PPSS_DIR" ] then - PPSS_DIR="./ppss" + PPSS_DIR="ppss" fi if [ ! -e "$PPSS_DIR" ] @@ -159,7 +159,7 @@ showusage () { echo echo -e "The following options are used for distributed execution of PPSS." echo - echo -e "--server | -s Specifies the SSH server that is used for communication between nodes." + echo -e "--master | -m Specifies the SSH server that is used for communication between nodes." echo -e " Using SSH, file locks are created, informing other nodes that an item " echo -e " is locked. Also, often items, such as files, reside on this host. SCP " echo -e " is used to transfer files from this host to nodes for local procesing." @@ -167,7 +167,7 @@ showusage () { echo -e "--node | -n File containig a list of nodes that act as PPSS clients. One IP / DNS " echo -e " name per line." echo - echo -e "--key | -k The SSH key that a node uses to connect to the server." + echo -e "--key | -k The SSH key that a node uses to connect to the master." echo echo -e "--known-hosts | -K The file that contains the server public key. Can often be found on " echo -e " hosts that already once connected to the server. See the file " @@ -243,11 +243,13 @@ exec_cmd () { CMD="$1" - if [ ! -z "$SSH_SERVER" ] && [ "$SECURE_COPY" == "1" ] + if [ ! -z "$SSH_SERVER" ] then ssh $SSH_OPTS $SSH_KEY $USER@$SSH_SERVER $CMD + return $? else eval "$CMD" + return $? fi } @@ -482,7 +484,7 @@ do shift 2 fi ;; - --server|-s ) + --master|-m ) SSH_SERVER="$2" add_var_to_config SSH_SERVER "$SSH_SERVER" shift 2 @@ -533,7 +535,7 @@ check_for_running_instances () { JOBS=`ps axu | grep -v grep | grep ${USER} | grep -v -i screen | grep ppss.sh | wc -l` #echo "$(date) : ${JOBS}" get_min_jobs - log INFO "Minjobs is $MIN_JOBS" + log DEBUG "Minjobs is $MIN_JOBS" if [ "$JOBS" -gt "$MIN_JOBS" ] then @@ -750,9 +752,15 @@ erase_ppss () { then for NODE in `cat $NODES_FILE` do - log INFO "Erasing PPSS homedir $PPSS_HOME_DIR from node $NODE." - ssh -q $SSH_KEY $SSH_OPTS $USER@$NODE "./$PPSS_HOME_DIR/$0 kill" - ssh -q $SSH_KEY $SSH_OPTS $USER@$NODE "rm -rf $PPSS_HOME_DIR" + does_file_exist "ppss" + if [ "$?" == "0" ] + then + log INFO "Erasing PPSS homedir $PPSS_HOME_DIR from node $NODE." + ssh -q $SSH_KEY $SSH_OPTS $USER@$NODE "./$PPSS_HOME_DIR/$0 kill" + ssh -q $SSH_KEY $SSH_OPTS $USER@$NODE "rm -rf $PPSS_HOME_DIR" + else + log INFO "PPSS was not present on node $NODE." + fi done else log INFO "Aborting.." @@ -1464,7 +1472,7 @@ start_all_workers () { get_status_of_node () { NODE="$1" - STATUS=`ssh -o ConnectTimeout=10 $SSH_KEY $USER@$NODE cat "$PPSS_HOME_DIR/$NODE_STATUS"` + STATUS=`ssh -o ConnectTimeout=10 $SSH_KEY $USER@$NODE cat "$PPSS_DIR/$NODE_STATUS"` ERROR="$?" if [ ! "$ERROR" == "0" ] then @@ -1508,7 +1516,13 @@ show_status () { for x in `cat $NODES_FILE` do NODE=`get_status_of_node "$x" | awk '{ print $1 }'` - RES=`exec_cmd "grep $NODE ~/$JOB_LOG_DIR/* | wc -l"` + RES=`exec_cmd "grep $NODE ~/$JOB_LOG_DIR/* >> /dev/null 2>&1"` + if [ ! "$ERROR" == "0" ] + then + RES=0 + else + RES=`exec_cmd "grep $NODE ~/$JOB_LOG_DIR/* | wc -l"` + fi let PROCESSED=$PROCESSED+$RES STATUS=`get_status_of_node "$x" | awk '{ print $2 }'` LINE=`echo "$x $NODE $RES $STATUS" | awk '{ printf ("%-16s %-18s % 10s %10s\n",$1,$2,$3,$4) }'`