Resolving lots of bugs, backup.

This commit is contained in:
louwrentius 2011-08-06 18:08:41 +00:00
parent 2946abc3d6
commit 126e71e231
2 changed files with 104 additions and 134 deletions

206
ppss
View File

@ -26,12 +26,12 @@
trap 'kill_process' SIGINT trap 'kill_process' SIGINT
SCRIPT_NAME="Distributed Parallel Processing Shell Script" SCRIPT_NAME="Distributed Parallel Processing Shell Script"
SCRIPT_VERSION="2.86" SCRIPT_VERSION="2.90"
# #
# The first argument to this script can be a mode. # The first argument to this script can be a mode.
# #
MODES="node start config stop pause continue deploy status erase kill ec2" MODES="node start config stop pause continue deploy status erase kill"
for x in $MODES for x in $MODES
do do
if [ "$x" == "$1" ] if [ "$x" == "$1" ]
@ -92,6 +92,7 @@ STAT=""
DAEMON_FILE_AGE="4" DAEMON_FILE_AGE="4"
ENABLE_INPUT_LOCK="0" ENABLE_INPUT_LOCK="0"
PROCESSING_TIME="" PROCESSING_TIME=""
NODE_ID="NODE_ID"
SSH_SERVER="" # Remote server or 'master'. SSH_SERVER="" # Remote server or 'master'.
SSH_KEY="" # SSH key for ssh account. SSH_KEY="" # SSH key for ssh account.
@ -119,7 +120,8 @@ REMOTE_OUTPUT_DIR="" # Remote directory to which output must
SCRIPT="" # Custom user script that is executed by ppss. SCRIPT="" # Custom user script that is executed by ppss.
ITEM_ESCAPED="" ITEM_ESCAPED=""
DISABLE_SKIPPING=0 DISABLE_SKIPPING=0
NODE_STATUS="$PPSS_DIR/$HOSTNAME-status.txt" PPSS_NODE_STATUS="$PPSS_DIR/NODE_STATUS"
NODE_STATUS_FILE="$PPSS_NODE_STATUS/$HOSTNAME-status.txt"
DAEMON=0 DAEMON=0
EMAIL="" EMAIL=""
@ -221,7 +223,6 @@ showusage_long () {
echo " config Generate a config file based on the supplied option parameters." echo " config Generate a config file based on the supplied option parameters."
echo " deploy Deploy PPSS and related files on the specified nodes." echo " deploy Deploy PPSS and related files on the specified nodes."
echo " erase Erase PPSS and related files from the specified nodes." echo " erase Erase PPSS and related files from the specified nodes."
echo " ec2 Start up Amazon EC2 instances and deploy PPSS on nodes."
echo echo
echo " start Starting PPSS on nodes." echo " start Starting PPSS on nodes."
echo " pause Pausing PPSS on all nodes." echo " pause Pausing PPSS on all nodes."
@ -1088,14 +1089,37 @@ init_vars () {
mkdir "$PPSS_LOCAL_OUTPUT" mkdir "$PPSS_LOCAL_OUTPUT"
fi fi
if [ ! -e "$PPSS_NODE_STATUS" ]
then
mkdir -p "$PPSS_NODE_STATUS"
fi
}
upload_status () {
scp -q $SSH_OPTS $SSH_KEY $NODE_STATUS_FILE $USER@$SSH_SERVER:$PPSS_HOME_DIR/$PPSS_NODE_STATUS/
if [ "$?" == "0" ]
then
log DEBUG "Uploaded status to server ok."
else
log DEBUG "Uploaded status to server failed."
fi
} }
set_status () { set_status () {
STATUS="$1" STATUS="$1"
echo "$HOSTNAME $STATUS" > "$NODE_STATUS" NO_PROCESSED=$(wc -l "$PPSS_HOME_DIR/$LIST_OF_PROCESSED_ITEMS" | cut -d " " -f 1)
NODE=`cat $PPSS_DIR/$NODE_ID`
echo "$NODE $HOSTNAME $STATUS $NO_PROCESSED" > "$NODE_STATUS_FILE"
if [ ! -z "$SSH_SERVER" ]
then
upload_status
fi
} }
check_status () { check_status () {
ERROR="$1" ERROR="$1"
@ -1202,47 +1226,36 @@ stack_pop () {
fi fi
} }
ec2_get_pending_nodes() { is_screen_installed () {
#
# This function has never been tested by the author of PPSS.
#
RES="$(ec2-describe-instances | grep 'INSTANCE' | awk '{print $4}'| grep pending)"
echo "$RES"
}
ec2_launch_nodes() { if [ "$DISABLE_SCREEN_TEST" == "1" ]
# then
# This function has never been tested by the author of PPSS. return 0
# fi
ec2run $AMI_ID -n $NUM_NODES -t $INSTANCE_TYPE -k $AWS_KEYPAIR -g $SECURITY_GROUP
# NODE="$1"
# Loop until all nodes are started ssh -q $SSH_OPTS_NODE $SSH_KEY $USER@$NODE "screen -m -D -S test ls" > /dev/null 2>&1
# if [ ! "$?" == "0" ]
STARTING="`ec2_get_pending_nodes`" then
while [ ! -z "$STARTING" ] log ERROR "The 'Screen' command is not installed on node $NODE."
do return 1
sleep 10 else
STARTING="`ec2_get_pending_nodes`" log DEBUG "'Screen' is installed on node $NODE."
log DSPLY "$STARTING" fi
done
#
# Write all instances / nodes to the nodes file.
#
ec2-describe-instances | grep 'INSTANCE' | awk '{print $4}' | sed '/terminated/d' | sed '/pending/d' >> $NODES_FILE
NO_OF_NODES="`wc -l $NODES_FILE | awk '{ print $1 }'`"
log DSPLY "Number of nodes / instances: $NO_OF_NODES"
} }
deploy () { deploy () {
NODE="$1" NODE="$1"
SSH_OPTS_NODE="-o BatchMode=yes -o ControlPath=socket-%h \ SSH_SOCKET="ppss_ssh_socket-$NODE"
SSH_OPTS_NODE="-o BatchMode=yes -o ControlPath=$SSH_SOCKET \
-o GlobalKnownHostsFile=./known_hosts \ -o GlobalKnownHostsFile=./known_hosts \
-o ControlMaster=auto \ -o ControlMaster=auto \
-o Cipher=blowfish \ -o Cipher=blowfish \
-o ConnectTimeout=5 " -o ConnectTimeout=5 "
ERROR=0 ERROR=0
set_error () { set_error () {
@ -1252,20 +1265,15 @@ deploy () {
fi fi
} }
ssh -q -o ConnectTimeout=5 $SSH_KEY $USER@$NODE exit 0
set_error "$?"
if [ ! "$ERROR" == "0" ]
then
log ERROR "Cannot connect to node $NODE."
return
fi
ssh -N -M $SSH_OPTS_NODE $SSH_KEY $USER@$NODE & ssh -N -M $SSH_OPTS_NODE $SSH_KEY $USER@$NODE &
SSH_PID=$! SSH_PID=$!
is_screen_installed "$NODE"
KEY=`echo $SSH_KEY | cut -d " " -f 2` KEY=`echo $SSH_KEY | cut -d " " -f 2`
ssh -q $SSH_OPTS_NODE $SSH_KEY $USER@$NODE "cd ~ && mkdir $PPSS_HOME_DIR >> /dev/null 2>&1" ssh -q $SSH_OPTS_NODE $SSH_KEY $USER@$NODE "cd ~ && mkdir -p $PPSS_HOME_DIR && mkdir -p $PPSS_HOME_DIR/$JOB_LOG_DIR && mkdir -p $PPSS_HOME_DIR/ITEM_LOCK_DIR >> /dev/null 2>&1"
ssh -q $SSH_OPTS_NODE $SSH_KEY $USER@$NODE "cd ~ && cd $PPSS_HOME_DIR && cd $PPSS_DIR && echo $NODE > $NODE_ID"
scp -q $SSH_OPTS_NODE $SSH_KEY $0 $USER@$NODE:~/$PPSS_HOME_DIR scp -q $SSH_OPTS_NODE $SSH_KEY $0 $USER@$NODE:~/$PPSS_HOME_DIR
set_error $? set_error $?
scp -q $SSH_OPTS_NODE $SSH_KEY $KEY $USER@$NODE:~/$PPSS_HOME_DIR scp -q $SSH_OPTS_NODE $SSH_KEY $KEY $USER@$NODE:~/$PPSS_HOME_DIR
@ -1274,6 +1282,7 @@ deploy () {
set_error $? set_error $?
scp -q $SSH_OPTS_NODE $SSH_KEY known_hosts $USER@$NODE:~/$PPSS_HOME_DIR scp -q $SSH_OPTS_NODE $SSH_KEY known_hosts $USER@$NODE:~/$PPSS_HOME_DIR
set_error $? set_error $?
if [ ! -z "$SCRIPT" ] if [ ! -z "$SCRIPT" ]
then then
scp -q $SSH_OPTS_NODE $SSH_KEY $SCRIPT $USER@$NODE:~/$PPSS_HOME_DIR scp -q $SSH_OPTS_NODE $SSH_KEY $SCRIPT $USER@$NODE:~/$PPSS_HOME_DIR
@ -1298,19 +1307,20 @@ deploy () {
deploy_ppss () { deploy_ppss () {
if [ -z "$NODES_FILE" ] || [ ! -e "$NODES_FILE" ]
if [ -z "$NODES_FILE" ]
then then
log ERROR "ERROR - are you using the right option? -C ?" log ERROR "No file containing list of nodes missing / not specified."
set_status ERROR set_status ERROR
cleanup cleanup
exit 1 exit 1
fi fi
exec_cmd "mkdir $PPSS_HOME_DIR/$PPSS_NODE_STATUS"
KEY=`echo $SSH_KEY | cut -d " " -f 2` KEY=`echo $SSH_KEY | cut -d " " -f 2`
if [ -z "$KEY" ] || [ ! -e "$KEY" ] if [ -z "$KEY" ] || [ ! -e "$KEY" ]
then then
log ERROR "Nodes require a key file." log ERROR "Private SSH key $KEY not found."
cleanup cleanup
set_status "ERROR" set_status "ERROR"
exit 1 exit 1
@ -1325,17 +1335,6 @@ deploy_ppss () {
fi fi
INSTALLED_ON_SSH_SERVER=0 INSTALLED_ON_SSH_SERVER=0
if [ ! -e "$NODES_FILE" ]
then
log ERROR "File $NODES with list of nodes does not exist."
set_status "ERROR"
cleanup
exit 1
else
if [ "$EC2" == "1" ]
then
ec2_launch_nodes
fi
for NODE in `cat $NODES_FILE` for NODE in `cat $NODES_FILE`
do do
deploy "$NODE" & deploy "$NODE" &
@ -1347,19 +1346,15 @@ deploy_ppss () {
fi fi
if [ "$NODE" == "$SSH_SERVER" ] if [ "$NODE" == "$SSH_SERVER" ]
then then
log DEBUG "SSH SERVER $SSH_SERVER is also a node."
INSTALLED_ON_SSH_SERVER=1 INSTALLED_ON_SSH_SERVER=1
exec_cmd "mkdir -p $PPSS_HOME_DIR/$JOB_LOG_DIR"
exec_cmd "mkdir -p $PPSS_HOME_DIR/$ITEM_LOCK_DIR"
fi fi
done done
if [ "$INSTALLED_ON_SSH_SERVER" == "0" ] if [ "$INSTALLED_ON_SSH_SERVER" == "0" ]
then then
log DEBUG "SSH SERVER $SSH_SERVER is not a node." log DEBUG "SSH SERVER $SSH_SERVER is not a node."
deploy "$SSH_SERVER" else
exec_cmd "mkdir -p $PPSS_HOME_DIR/$JOB_LOG_DIR" log DEBUG "SSH SERVER $SSH_SERVER is also a node."
exec_cmd "mkdir -p $PPSS_HOME_DIR/$ITEM_LOCK_DIR"
fi
fi fi
} }
@ -1367,7 +1362,7 @@ start_ppss_on_node () {
NODE="$1" NODE="$1"
log DSPLY "Starting PPSS on node $NODE." log DSPLY "Starting PPSS on node $NODE."
ssh $SSH_KEY $USER@$NODE -o ConnectTimeout=5 "cd $PPSS_HOME_DIR ; screen -d -m -S PPSS ~/$PPSS_HOME_DIR/$0 node --config ~/$PPSS_HOME_DIR/$CONFIG" ssh $SSH_KEY $USER@$NODE -o ConnectTimeout=5 -o GlobalKnownHostsFile=./known_hosts "cd $PPSS_HOME_DIR ; screen -d -m -S PPSS ~/$PPSS_HOME_DIR/$0 node --config ~/$PPSS_HOME_DIR/$CONFIG"
} }
test_server () { test_server () {
@ -1701,32 +1696,6 @@ list_all_input_items () {
IFS="$oldIFS" IFS="$oldIFS"
} }
# Print every item of the first list that does not occur in the second list.
#
# Arguments:
#   $1 - list of candidate items ("var a"), split into words on the current IFS
#   $2 - list of items to exclude ("var b"), split into words on the current IFS
# Outputs:
#   Each item of $1 that is absent from $2, one per line, on stdout.
#   Items that are present in $2 are reported through 'log DEBUG' instead.
#
# All working variables are declared local so that calling this function in
# the current shell does not overwrite globals named a, b, VAR_A, VAR_B or
# A_IS_IN_B.
return_difference_between_vars () {

    local VAR_A="$1"
    local VAR_B="$2"
    local a b A_IS_IN_B

    # Both lists are expanded unquoted on purpose: word splitting is what
    # turns each list into its individual items.
    for a in $VAR_A
    do
        A_IS_IN_B=0
        for b in $VAR_B
        do
            if [ "$a" = "$b" ]
            then
                A_IS_IN_B=1
                break    # One match is enough; no need to scan the rest.
            fi
        done

        if [ "$A_IS_IN_B" = "0" ]
        then
            echo "$a"
        else
            log DEBUG "Value $a occurs in 'var b'"
        fi
    done
}
remove_processed_items_from_input_file () { remove_processed_items_from_input_file () {
# #
@ -2679,17 +2648,30 @@ start_all_workers () {
done done
} }
#get_status_of_node () {
#
# NODE="$1"
# NODE_HOSTNAME=`ssh $SSH_OPTS_NODE $SSH_KEY $USER@$NODE hostname`
# STATUS=`ssh $SSH_OPTS_NODE $SSH_KEY $USER@$NODE cat "$PPSS_HOME_DIR/$PPSS_NODE_STATUS/$NODE_HOSTNAME-status.txt" 2>/dev/null`
# echo "STATUS=ssh $SSH_OPTS_NODE $SSH_KEY $USER@$NODE cat $PPSS_HOME_DIR/$PPSS_DIR/$PPSS_NODE_STATUS/$NODE_HOSTNAME-status.txt 2>/dev/null" >> abc.txt
# ERROR="$?"
# if [ ! "$ERROR" == "0" ]
# then
# STATUS="UNKNOWN"
# fi
# echo "$STATUS"
#}
#get_node_status_from_server () {
get_status_of_node () { get_status_of_node () {
NODE="$1" RES=`ssh $SSH_OPTS $SSH_KEY $USER@$SERVER cat "$PPSS_HOME_DIR/$PPSS_NODE_STATUS/*" 2> /dev/null`
NODE_HOSTNAME=`ssh -o ConnectTimeout=10 $SSH_KEY $USER@NODE hostname` IFS=$'\n'
STATUS=`ssh -o ConnectTimeout=10 $SSH_KEY $USER@$NODE cat "$PPSS_HOME_DIR/$NODE_HOSTNAME-status.txt" 2>/dev/null` for x in $RES
ERROR="$?" do
if [ ! "$ERROR" == "0" ] log DSPLY "$x"
then done
STATUS="UNKNOWN"
fi
echo "$STATUS"
} }
show_status () { show_status () {
@ -2706,7 +2688,7 @@ show_status () {
if [ ! -z "$ITEMS" ] && [ ! "$ITEMS" == "0" ] if [ ! -z "$ITEMS" ] && [ ! "$ITEMS" == "0" ]
then then
PROCESSED=`exec_cmd "ls -1 $ITEM_LOCK_DIR 2>/dev/null | wc -l" 1` 2>&1 >> /dev/null PROCESSED=`exec_cmd "ls -1 $PPSS_HOME_DIR/$ITEM_LOCK_DIR 2>/dev/null | wc -l" 1` 2>&1 >> /dev/null
TMP_STATUS=$((100 * $PROCESSED / $ITEMS)) TMP_STATUS=$((100 * $PROCESSED / $ITEMS))
log DSPLY "Status:\t\t$TMP_STATUS percent complete." log DSPLY "Status:\t\t$TMP_STATUS percent complete."
else else
@ -2720,12 +2702,12 @@ show_status () {
fi fi
log DSPLY "Items:\t\t$ITEMS" log DSPLY "Items:\t\t$ITEMS"
log DSPLY "---------------------------------------------------------" log DSPLY "---------------------------------------------------------"
HEADER=`echo IP-address Hostname Processed Status | awk '{ printf ("%-16s %-18s % 10s %10s\n",$1,$2,$3,$4) }'` HEADER=`echo IP-address Hostname Processed Status | awk '{ printf ("%-16s %-18s % 10s %10s\n",$1,$2,$3,$4) }'`
log DSPLY "$HEADER" log DSPLY "$HEADER"
log DSPLY "---------------------------------------------------------" log DSPLY "---------------------------------------------------------"
PROCESSED=0 PROCESSED=0
for x in `cat $NODES_FILE` for x in `cat $NODES_FILE`
do do
RES=0 RES=0
@ -2825,20 +2807,9 @@ main () {
exit 0 exit 0
;; ;;
deploy ) deploy )
LOGFILE=/dev/null LOGFILE=ppss-deploy.txt
display_header display_header
log DSPLY "Deploying PPSS on nodes." log DSPLY "Deploying PPSS on nodes. See ppss-deploy.txt for details."
deploy_ppss
wait
cleanup
exit 0
;;
ec2)
EC2=1
LOGFILE=/dev/null
display_header
log INFO "Deploying PPSS on EC2 nodes."
ec2_launch_nodes
deploy_ppss deploy_ppss
wait wait
cleanup cleanup
@ -2847,8 +2818,7 @@ main () {
status ) status )
LOGFILE=/dev/null LOGFILE=/dev/null
display_header display_header
init_vars test_server
get_all_items
show_status show_status
exit 0 exit 0
;; ;;

View File

@ -1,7 +1,7 @@
#!/bin/bash #!/bin/bash
DEBUG="$1" DEBUG="$1"
VERSION="2.86" VERSION="2.90"
TMP_DIR="/tmp/ppss" TMP_DIR="/tmp/ppss"
PPSS=./ppss PPSS=./ppss
PPSS_DIR=ppss_dir PPSS_DIR=ppss_dir