some distributed mode bug fixes

This commit is contained in:
louwrentius 2011-08-29 01:09:02 +00:00
parent eaa18282d1
commit 55f4b1f2e2
1 changed files with 148 additions and 44 deletions

192
ppss
View File

@ -75,9 +75,9 @@ LISTENER_PID=""
IFS_BACKUP="$IFS" IFS_BACKUP="$IFS"
CPUINFO="/proc/cpuinfo" CPUINFO="/proc/cpuinfo"
PROCESSORS="" PROCESSORS=""
START_KEY="$RANDOM$RANDOM$RANDOM$RANDOM" # If this key is received by listener, start a new process START_KEY="start-$RANDOM$RANDOM$RANDOM$RANDOM" # If this key is received by listener, start a new process
FAIL_KEY="$RANDOM$RANDOM$RANDOM$RANDOM" # if this key is received by listener, increase error count FAIL_KEY="fail-$RANDOM$RANDOM$RANDOM$RANDOM" # if this key is received by listener, increase error count
KILL_KEY="$RANDOM$RANDOM$RANDOM$RANDOM" # This is a signal to stop immediately and kill PPSS KILL_KEY="kill-$RANDOM$RANDOM$RANDOM$RANDOM" # This is a signal to stop immediately and kill PPSS
QUEUE="" QUEUE=""
INOTIFY="" INOTIFY=""
RECURSION="1" # all running processes. RECURSION="1" # all running processes.
@ -96,6 +96,7 @@ ENABLE_INPUT_LOCK="0"
PROCESSING_TIME="" PROCESSING_TIME=""
NODE_ID="NODE_ID" NODE_ID="NODE_ID"
USE_MD5="0" USE_MD5="0"
RANDOMIZE="0"
SSH_SERVER="" # Remote server or 'master'. SSH_SERVER="" # Remote server or 'master'.
SSH_KEY="" # SSH key for ssh account. SSH_KEY="" # SSH key for ssh account.
@ -285,10 +286,6 @@ showusage_long () {
echo -e " used. If this is not prefered, this can be disabled with this option " echo -e " used. If this is not prefered, this can be disabled with this option "
echo -e " Only files within the specified directory will be processed." echo -e " Only files within the specified directory will be processed."
echo echo
echo -e "--no-recursion|-r By default, recursion of directories is enabled when the -d option is "
echo -e " used. If this is not prefered, this can be disabled with this option."
echo -e " Only files within the specified directory will be processed."
echo
echo -e "--md5|-M Use MD5 to create unique file names for locking and log file names." echo -e "--md5|-M Use MD5 to create unique file names for locking and log file names."
echo -e " PPSS strips al non [:alnum:] characters of an item string and this may" echo -e " PPSS strips al non [:alnum:] characters of an item string and this may"
echo -e " cause collisions. String ABC!@# and ABC^&* will become both ABC___" echo -e " cause collisions. String ABC!@# and ABC^&* will become both ABC___"
@ -335,6 +332,10 @@ showusage_long () {
echo echo
echo -e "--script | -S Script to run on the node. PPSS must copy this script to the node." echo -e "--script | -S Script to run on the node. PPSS must copy this script to the node."
echo echo
echo -e "--randomize | -R Randomise which items to process by the client in distributed mode."
echo -e " This makes sure that with many nodes, some clients spend their time"
echo -e " trying to get a lock on an item."
echo
echo -e "Example: encoding some wav files to mp3 using lame:" echo -e "Example: encoding some wav files to mp3 using lame:"
echo echo
echo -e "$0 -c 'lame ' -d /path/to/wavfiles -j " echo -e "$0 -c 'lame ' -d /path/to/wavfiles -j "
@ -375,20 +376,19 @@ exec_cmd () {
then then
if [ -z "$NOMP" ] if [ -z "$NOMP" ]
then then
# log DEBUG "REMOTE EXEC" #log DEBUG "REMOTE EXEC"
# log DEBUG "$USER@$SSH_SERVER $CMD"
ssh $SSH_OPTS $SSH_KEY $USER@$SSH_SERVER $CMD ssh $SSH_OPTS $SSH_KEY $USER@$SSH_SERVER $CMD
STATUS=$? STATUS=$?
elif [ "$NOMP" == "1" ] elif [ "$NOMP" == "1" ]
then then
# log DEBUG "REMOTE EXEC NO MP" #log DEBUG "REMOTE EXEC NO MP"
ssh $SSH_OPTS_NOMP $SSH_KEY $USER@$SSH_SERVER $CMD ssh $SSH_OPTS_NOMP $SSH_KEY $USER@$SSH_SERVER $CMD
STATUS=$? STATUS=$?
fi fi
else else
eval "$CMD" eval "$CMD"
STATUS=$? STATUS=$?
# log DEBUG "LOCAL EXEC - status is $STATUS" log DEBUG "LOCAL EXEC - status is $STATUS"
fi fi
return $STATUS return $STATUS
} }
@ -444,7 +444,10 @@ cleanup () {
do do
if [ "$x" == "$MODE" ] if [ "$x" == "$MODE" ]
then then
rm -rf "$PPSS_DIR" if [ "$x" != "node" ]
then
rm -rf "$PPSS_DIR"
fi
fi fi
done done
@ -462,6 +465,11 @@ cleanup () {
then then
rm -rf "$SSH_SOCKET" rm -rf "$SSH_SOCKET"
fi fi
if [ ! -z "$SSH_MASTER_PID" ]
then
kill "$SSH_MASTER_PID"
fi
} }
add_var_to_config () { add_var_to_config () {
@ -712,6 +720,10 @@ process_arguments () {
SECURE_COPY=0 SECURE_COPY=0
add_var_to_config SECURE_COPY "$SECURE_COPY" add_var_to_config SECURE_COPY "$SECURE_COPY"
shift 1 ;; shift 1 ;;
--randomize |-R )
RANDOMIZE=1
add_var_to_config RANDOMIZE "$RANDOMIZE"
shift 1 ;;
--outputdir|-o ) --outputdir|-o )
REMOTE_OUTPUT_DIR="$2" REMOTE_OUTPUT_DIR="$2"
add_var_to_config REMOTE_OUTPUT_DIR "$REMOTE_OUTPUT_DIR" add_var_to_config REMOTE_OUTPUT_DIR "$REMOTE_OUTPUT_DIR"
@ -750,7 +762,6 @@ process_arguments () {
USER="$2" USER="$2"
add_var_to_config USER "$USER" add_var_to_config USER "$USER"
shift 2 ;; shift 2 ;;
--version|-v ) --version|-v )
echo "" echo ""
echo "$SCRIPT_NAME version $SCRIPT_VERSION" echo "$SCRIPT_NAME version $SCRIPT_VERSION"
@ -1117,12 +1128,19 @@ init_vars () {
upload_status () { upload_status () {
scp -q $SSH_OPTS $SSH_KEY $NODE_STATUS_FILE $USER@$SSH_SERVER:$PPSS_HOME_DIR/$PPSS_NODE_STATUS/ #log DEBUG "scp $SSH_OPTS $SSH_KEY $NODE_STATUS_FILE $USER@$SSH_SERVER:$PPSS_HOME_DIR/$PPSS_NODE_STATUS/"
if [ "$?" == "0" ] # scp -v $SSH_OPTS $SSH_KEY $NODE_STATUS_FILE $USER@$SSH_SERVER:$PPSS_HOME_DIR/$PPSS_NODE_STATUS/ >> scp.tmp 2>&1
if [ -e "$NODE_STATUS_FILE" ]
then then
log DEBUG "Uploaded status to server ok." scp -vv -o GlobalKnownHostsFile=./known_hosts -i ppss-key.dsa $NODE_STATUS_FILE $USER@$SSH_SERVER:$PPSS_HOME_DIR/$PPSS_NODE_STATUS/ >> scp.tmp 2>&1
if [ "$?" == "0" ]
then
log DEBUG "Uploaded status to server ok."
else
log DEBUG "Uploaded status to server failed."
fi
else else
log DEBUG "Uploaded status to server failed." log DEBUG "Status file not found thus not uploaded."
fi fi
} }
@ -1131,9 +1149,19 @@ set_status () {
if [ ! -z "$SSH_SERVER" ] if [ ! -z "$SSH_SERVER" ]
then then
STATUS="$1" STATUS="$1"
NO_PROCESSED=$(wc -l "$LIST_OF_PROCESSED_ITEMS" | awk '{ print $1 }' ) if [ -e "$LIST_OF_PROCESSED_ITEMS" ]
then
NO_PROCESSED=$(wc -l "$LIST_OF_PROCESSED_ITEMS" | awk '{ print $1 }' )
else
NO_PROCESSED="0"
fi
NODE=`cat $PPSS_DIR/$NODE_ID` NODE=`cat $PPSS_DIR/$NODE_ID`
FAILED="$2" FAILED="$2"
if [ -z "$FAILED" ]
then
FAILED=0
fi
echo "$NODE $HOSTNAME $STATUS $NO_PROCESSED" "$FAILED" > "$NODE_STATUS_FILE" echo "$NODE $HOSTNAME $STATUS $NO_PROCESSED" "$FAILED" > "$NODE_STATUS_FILE"
upload_status upload_status
@ -1157,6 +1185,13 @@ check_status () {
erase_ppss () { erase_ppss () {
SSH_SOCKET="ppss_ssh_socket-$NODE"
SSH_OPTS_NODE="-o BatchMode=yes -o ControlPath=$SSH_SOCKET \
-o GlobalKnownHostsFile=./known_hosts \
-o ControlMaster=auto \
-o Cipher=blowfish \
-o ConnectTimeout=5 "
echo "Are you realy sure you want to erase PPSS from all nodes!? (YES/NO)" echo "Are you realy sure you want to erase PPSS from all nodes!? (YES/NO)"
read YN read YN
@ -1166,7 +1201,7 @@ erase_ppss () {
for NODE in `cat $NODES_FILE` for NODE in `cat $NODES_FILE`
do do
log DSPLY "Erasing PPSS homedir $PPSS_HOME_DIR from node $NODE." log DSPLY "Erasing PPSS homedir $PPSS_HOME_DIR from node $NODE."
ssh -q $SSH_KEY $SSH_OPTS $USER@$NODE "rm -rf $PPSS_HOME_DIR" ssh -q $SSH_KEY $SSH_OPTS_NODE $USER@$NODE "rm -rf $PPSS_HOME_DIR"
done done
else else
log DSPLY "Aborting.." log DSPLY "Aborting.."
@ -1277,6 +1312,12 @@ deploy () {
-o Cipher=blowfish \ -o Cipher=blowfish \
-o ConnectTimeout=5 " -o ConnectTimeout=5 "
SSH_OPTS_SLAVE="-o BatchMode=yes -o ControlPath=$SSH_SOCKET \
-o GlobalKnownHostsFile=./known_hosts \
-o ControlMaster=no \
-o Cipher=blowfish \
-o ConnectTimeout=5 "
ERROR=0 ERROR=0
set_error () { set_error () {
@ -1288,36 +1329,38 @@ deploy () {
ERROR=1 ERROR=1
fi fi
} }
if [ ! -e "$SSH_SOCKET" ]
ssh -N -M $SSH_OPTS_NODE $SSH_KEY $USER@$NODE & then
SSH_PID=$! ssh -q -N $SSH_OPTS_NODE $SSH_KEY $USER@$NODE &
SSH_PID=$!
fi
is_screen_installed "$NODE" is_screen_installed "$NODE"
KEY=`echo $SSH_KEY | cut -d " " -f 2` KEY=`echo $SSH_KEY | cut -d " " -f 2`
ssh -q $SSH_OPTS_NODE $SSH_KEY $USER@$NODE "cd ~ && mkdir -p $PPSS_HOME_DIR && mkdir -p $PPSS_HOME_DIR/$JOB_LOG_DIR && mkdir -p $PPSS_HOME_DIR/ITEM_LOCK_DIR >> /dev/null 2>&1" ssh -q $SSH_OPTS_SLAVE $SSH_KEY $USER@$NODE "cd ~ && mkdir -p $PPSS_HOME_DIR && mkdir -p $PPSS_HOME_DIR/$JOB_LOG_DIR && mkdir -p $PPSS_HOME_DIR/ITEM_LOCK_DIR >> /dev/null 2>&1"
set_error $? set_error $?
ssh -q $SSH_OPTS_NODE $SSH_KEY $USER@$NODE "cd ~ && cd $PPSS_HOME_DIR && cd $PPSS_DIR && echo $NODE > $NODE_ID" ssh -q $SSH_OPTS_SLAVE $SSH_KEY $USER@$NODE "cd ~ && cd $PPSS_HOME_DIR && cd $PPSS_DIR && echo $NODE > $NODE_ID"
set_error $? set_error $?
scp -q $SSH_OPTS_NODE $SSH_KEY $0 $USER@$NODE:~/$PPSS_HOME_DIR scp -q $SSH_OPTS_SLAVE $SSH_KEY $0 $USER@$NODE:~/$PPSS_HOME_DIR
set_error $? set_error $?
scp -q $SSH_OPTS_NODE $SSH_KEY $KEY $USER@$NODE:~/$PPSS_HOME_DIR scp -q $SSH_OPTS_SLAVE $SSH_KEY $KEY $USER@$NODE:~/$PPSS_HOME_DIR
set_error $? set_error $?
scp -q $SSH_OPTS_NODE $SSH_KEY $CONFIG $USER@$NODE:~/$PPSS_HOME_DIR scp -q $SSH_OPTS_SLAVE $SSH_KEY $CONFIG $USER@$NODE:~/$PPSS_HOME_DIR
set_error $? set_error $?
scp -q $SSH_OPTS_NODE $SSH_KEY known_hosts $USER@$NODE:~/$PPSS_HOME_DIR scp -q $SSH_OPTS_SLAVE $SSH_KEY known_hosts $USER@$NODE:~/$PPSS_HOME_DIR
set_error $? set_error $?
if [ ! -z "$SCRIPT" ] if [ ! -z "$SCRIPT" ]
then then
scp -q $SSH_OPTS_NODE $SSH_KEY $SCRIPT $USER@$NODE:~/$PPSS_HOME_DIR scp -q $SSH_OPTS_SLAVE $SSH_KEY $SCRIPT $USER@$NODE:~/$PPSS_HOME_DIR
set_error $? set_error $?
fi fi
if [ ! -z "$INPUT_FILE" ] if [ ! -z "$INPUT_FILE" ]
then then
scp -q $SSH_OPTS_NODE $SSH_KEY $INPUT_FILE $USER@$NODE:~/$PPSS_HOME_DIR scp -q $SSH_OPTS_SLAVE $SSH_KEY $INPUT_FILE $USER@$NODE:~/$PPSS_HOME_DIR
set_error $? set_error $?
fi fi
@ -1341,7 +1384,7 @@ deploy_ppss () {
exit 1 exit 1
fi fi
exec_cmd "mkdir $PPSS_HOME_DIR/$PPSS_NODE_STATUS" exec_cmd "mkdir -p $PPSS_HOME_DIR/$PPSS_NODE_STATUS"
KEY=`echo $SSH_KEY | cut -d " " -f 2` KEY=`echo $SSH_KEY | cut -d " " -f 2`
if [ -z "$KEY" ] || [ ! -e "$KEY" ] if [ -z "$KEY" ] || [ ! -e "$KEY" ]
@ -1389,6 +1432,19 @@ start_ppss_on_node () {
NODE="$1" NODE="$1"
log DSPLY "Starting PPSS on node $NODE." log DSPLY "Starting PPSS on node $NODE."
ssh $SSH_KEY $USER@$NODE -o ConnectTimeout=5 -o GlobalKnownHostsFile=./known_hosts "cd $PPSS_HOME_DIR ; screen -d -m -S PPSS ~/$PPSS_HOME_DIR/$0 node --config ~/$PPSS_HOME_DIR/$CONFIG" ssh $SSH_KEY $USER@$NODE -o ConnectTimeout=5 -o GlobalKnownHostsFile=./known_hosts "cd $PPSS_HOME_DIR ; screen -d -m -S PPSS ~/$PPSS_HOME_DIR/$0 node --config ~/$PPSS_HOME_DIR/$CONFIG"
if [ ! "$?" == "0" ]
then
log ERROR "PPSS failed to start on node $NODE."
fi
}
init_ssh_server_socket () {
if [ ! -e "$SSH_SOCKET" ]
then
DIR=`dirname $SSH_SOCKET`
mkdir -p "$DIR"
fi
} }
test_server () { test_server () {
@ -1396,20 +1452,15 @@ test_server () {
# Testing if the remote server works as expected. # Testing if the remote server works as expected.
if [ ! -z "$SSH_SERVER" ] if [ ! -z "$SSH_SERVER" ]
then then
if [ ! -e "$SSH_SOCKET" ] init_ssh_server_socket
then
DIR=`dirname $SSH_SOCKET`
mkdir -p "$DIR"
fi
exec_cmd "date >> /dev/null" exec_cmd "date >> /dev/null"
check_status "$?" "$FUNCNAME" "Server $SSH_SERVER could not be reached" check_status "$?" "$FUNCNAME" "Server $SSH_SERVER could not be reached"
ssh -N -M $SSH_OPTS $SSH_KEY $USER@$SSH_SERVER & ssh -N -M $SSH_OPTS $SSH_KEY $USER@$SSH_SERVER &
SSH_MASTER_PID="$!" SSH_MASTER_PID="$!"
log DEBUG "SSH Master pid is $SSH_MASTER_PID" log DEBUG "SSH Master pid is $SSH_MASTER_PID"
log DSPLY "Connected to server: $SSH_SERVER" log INFO "Connected to server: $SSH_SERVER"
does_file_exist "$PPSS_HOME_DIR/$PPSS_DIR" does_file_exist "$PPSS_HOME_DIR/$PPSS_DIR"
if [ ! "$?" = "0" ] && [ ! -z "$SSH_SERVER" ] if [ ! "$?" = "0" ] && [ ! -z "$SSH_SERVER" ]
@ -1420,7 +1471,6 @@ test_server () {
else else
log DEBUG "No remote server specified, assuming stand-alone mode." log DEBUG "No remote server specified, assuming stand-alone mode."
fi fi
} }
get_no_of_cpus () { get_no_of_cpus () {
@ -1675,6 +1725,10 @@ lock_item () {
if [ "$INOTIFY" = "1" ] && [ "$DAEMON" = "1" ] if [ "$INOTIFY" = "1" ] && [ "$DAEMON" = "1" ]
then then
#
# In daemon mode, there is no risk that processes try to process
# the same item. Therefore, locking is not required.
#
return 0 return 0
else else
ITEM="$1" ITEM="$1"
@ -1894,6 +1948,20 @@ get_all_items () {
fi fi
fi fi
if [ "$RANDOMIZE" == "1" ] && [ "$MODE" != "status" ]
then
log DEBUG "Randomizing input file."
IFS_BACK="$IFS"
IFS=$'\n'
TMP_FILE="$PPSS_DIR/TMP-$RANDOM$RANDOM.txt"
for i in `cat $LISTOFITEMS`; do echo "$RANDOM $i"; done | sort | sed -E 's/^[0-9]+ //' > "$TMP_FILE"
mv "$TMP_FILE" "$LISTOFITEMS"
IFS="$IFS_BACK"
else
log DEBUG "Randomisation of input file disabled."
fi
remove_processed_items_from_input_file remove_processed_items_from_input_file
if [ "$DAEMON" == "1" ] if [ "$DAEMON" == "1" ]
@ -1913,6 +1981,19 @@ get_all_items () {
} }
are_all_items_locked () {
SIZE="$1"
NUMBER=`exec_cmd "ls -1 $ITEM_LOCK_DIR | wc -l"`
log DEBUG "$NUMBER of $SIZE items are locked."
if [ "$NUMBER" -ge "$SIZE" ]
then
return 0
else
return 1
fi
}
get_item () { get_item () {
check_for_interrupt check_for_interrupt
@ -1949,6 +2030,18 @@ get_item () {
return 1 return 1
fi fi
#
# Quit if all items have been locked.
#
if are_all_items_locked "$SIZE_OF_INPUT"
then
log DEBUG "All items have been locked."
return 1
else
log DEBUG "There are still unlocked items."
fi
ITEM="$(sed -n $GLOBAL_COUNTER\p $LISTOFITEMS)" ITEM="$(sed -n $GLOBAL_COUNTER\p $LISTOFITEMS)"
if [ -z "$ITEM" ] if [ -z "$ITEM" ]
@ -2710,7 +2803,12 @@ get_status_of_nodes () {
RESULT_FILE="$1" RESULT_FILE="$1"
FAILED=0 FAILED=0
ssh $SSH_OPTS $SSH_KEY $USER@$SSH_SERVER cat "$PPSS_HOME_DIR/$PPSS_NODE_STATUS/*" > "$RESULT_FILE" 2>&1 ssh -q $SSH_OPTS $SSH_KEY $USER@$SSH_SERVER cat "$PPSS_HOME_DIR/$PPSS_NODE_STATUS/*" > "$RESULT_FILE" 2>&1
if [ ! "$?" == "0" ]
then
log DSPLY "PPSS has not been started yet on nodes."
return 1
fi
IFS=$'\n' IFS=$'\n'
@ -2747,7 +2845,6 @@ show_status () {
then then
SSH_KEY="-i $SSH_KEY" SSH_KEY="-i $SSH_KEY"
fi fi
get_all_items get_all_items
ITEMS=`wc -l $LISTOFITEMS | awk '{ print $1 }'` ITEMS=`wc -l $LISTOFITEMS | awk '{ print $1 }'`
@ -2755,6 +2852,7 @@ show_status () {
if [ ! -z "$ITEMS" ] && [ ! "$ITEMS" == "0" ] if [ ! -z "$ITEMS" ] && [ ! "$ITEMS" == "0" ]
then then
PROCESSED=`exec_cmd "ls -1 $PPSS_HOME_DIR/$ITEM_LOCK_DIR 2>/dev/null | wc -l" 1` 2>&1 >> /dev/null PROCESSED=`exec_cmd "ls -1 $PPSS_HOME_DIR/$ITEM_LOCK_DIR 2>/dev/null | wc -l" 1` 2>&1 >> /dev/null
check_status "$?" "Could not get number of processed items."
TMP_STATUS=$((100 * $PROCESSED / $ITEMS)) TMP_STATUS=$((100 * $PROCESSED / $ITEMS))
log DSPLY "Status:\t\t$TMP_STATUS percent complete." log DSPLY "Status:\t\t$TMP_STATUS percent complete."
else else
@ -2774,8 +2872,7 @@ show_status () {
log DSPLY "---------------------------------------------------------" log DSPLY "---------------------------------------------------------"
PROCESSED=0 PROCESSED=0
RESULT_FILE="$RADOM$RANDOM.deleteme" get_status_of_nodes "RESULT_FILE"
get_status_of_nodes "$RESULT_FILE"
} }
@ -2804,7 +2901,7 @@ main () {
else else
for NODE in `cat $NODES_FILE` for NODE in `cat $NODES_FILE`
do do
start_ppss_on_node "$NODE" start_ppss_on_node "$NODE" &
done done
fi fi
cleanup cleanup
@ -2822,6 +2919,7 @@ main () {
LOGFILE=/dev/null LOGFILE=/dev/null
display_header display_header
log DSPLY "Stopping PPSS on all nodes." log DSPLY "Stopping PPSS on all nodes."
test_server
exec_cmd "touch $STOP_SIGNAL" exec_cmd "touch $STOP_SIGNAL"
cleanup cleanup
;; ;;
@ -2849,6 +2947,12 @@ main () {
;; ;;
deploy ) deploy )
LOGFILE=ppss-deploy.txt LOGFILE=ppss-deploy.txt
if [ -e "$LOGFILE" ]
then
rm "$LOGFILE"
fi
init_ssh_server_socket
display_header display_header
log DSPLY "Deploying PPSS on nodes. See ppss-deploy.txt for details." log DSPLY "Deploying PPSS on nodes. See ppss-deploy.txt for details."
deploy_ppss deploy_ppss