Resolving lots of bugs, backup.
This commit is contained in:
parent
2946abc3d6
commit
126e71e231
236
ppss
236
ppss
@ -26,12 +26,12 @@
|
||||
trap 'kill_process' SIGINT
|
||||
|
||||
SCRIPT_NAME="Distributed Parallel Processing Shell Script"
|
||||
SCRIPT_VERSION="2.86"
|
||||
SCRIPT_VERSION="2.90"
|
||||
|
||||
#
|
||||
# The first argument to this script can be a mode.
|
||||
#
|
||||
MODES="node start config stop pause continue deploy status erase kill ec2"
|
||||
MODES="node start config stop pause continue deploy status erase kill"
|
||||
for x in $MODES
|
||||
do
|
||||
if [ "$x" == "$1" ]
|
||||
@ -92,6 +92,7 @@ STAT=""
|
||||
DAEMON_FILE_AGE="4"
|
||||
ENABLE_INPUT_LOCK="0"
|
||||
PROCESSING_TIME=""
|
||||
NODE_ID="NODE_ID"
|
||||
|
||||
SSH_SERVER="" # Remote server or 'master'.
|
||||
SSH_KEY="" # SSH key for ssh account.
|
||||
@ -119,7 +120,8 @@ REMOTE_OUTPUT_DIR="" # Remote directory to which output must
|
||||
SCRIPT="" # Custom user script that is executed by ppss.
|
||||
ITEM_ESCAPED=""
|
||||
DISABLE_SKIPPING=0
|
||||
NODE_STATUS="$PPSS_DIR/$HOSTNAME-status.txt"
|
||||
PPSS_NODE_STATUS="$PPSS_DIR/NODE_STATUS"
|
||||
NODE_STATUS_FILE="$PPSS_NODE_STATUS/$HOSTNAME-status.txt"
|
||||
DAEMON=0
|
||||
EMAIL=""
|
||||
|
||||
@ -221,7 +223,6 @@ showusage_long () {
|
||||
echo " config Generate a config file based on the supplied option parameters."
|
||||
echo " deploy Deploy PPSS and related files on the specified nodes."
|
||||
echo " erase Erase PPSS and related files from the specified nodes."
|
||||
echo " ec2 Start up Amazon EC2 instances and deploy PPSS on nodes."
|
||||
echo
|
||||
echo " start Starting PPSS on nodes."
|
||||
echo " pause Pausing PPSS on all nodes."
|
||||
@ -1088,14 +1089,37 @@ init_vars () {
|
||||
mkdir "$PPSS_LOCAL_OUTPUT"
|
||||
fi
|
||||
|
||||
if [ ! -e "$PPSS_NODE_STATUS" ]
|
||||
then
|
||||
mkdir -p "$PPSS_NODE_STATUS"
|
||||
fi
|
||||
|
||||
}
|
||||
|
||||
# Copy this node's status file into the node-status directory on the SSH
# server.
#
# Globals read: SSH_OPTS, SSH_KEY, NODE_STATUS_FILE, USER, SSH_SERVER,
#               PPSS_HOME_DIR, PPSS_NODE_STATUS.
# The outcome is reported only through 'log DEBUG'; the function itself never
# exits the script.
upload_status () {

    # SSH_OPTS and SSH_KEY are deliberately left unquoted so they split into
    # separate scp arguments: SSH_KEY is cut on spaces elsewhere in this file
    # (its second field is the key file), and SSH_OPTS presumably holds
    # several options the way SSH_OPTS_NODE does - TODO confirm.
    if scp -q $SSH_OPTS $SSH_KEY "$NODE_STATUS_FILE" "$USER@$SSH_SERVER:$PPSS_HOME_DIR/$PPSS_NODE_STATUS/"
    then
        log DEBUG "Uploaded status to server ok."
    else
        log DEBUG "Uploaded status to server failed."
    fi
}
|
||||
|
||||
# Record this node's current status locally and, when an SSH server is
# configured, upload the status file to it via upload_status.
#
# Arguments: $1 - status word to record (callers in this file pass e.g. ERROR).
# Globals read:    HOSTNAME, NODE_STATUS, NODE_STATUS_FILE, PPSS_HOME_DIR,
#                  LIST_OF_PROCESSED_ITEMS, PPSS_DIR, NODE_ID, SSH_SERVER.
# Globals written: STATUS, NO_PROCESSED, NODE.
# Status file format: "<node> <hostname> <status> <number of processed items>".
set_status () {

    local processed_list="$PPSS_HOME_DIR/$LIST_OF_PROCESSED_ITEMS"

    STATUS="$1"

    # NOTE(review): this write to the older per-host file sits next to the
    # NODE_STATUS_FILE write below; it looks like a leftover of the previous
    # format - confirm whether it is still needed.
    if [ -n "$NODE_STATUS" ]
    then
        echo "$HOSTNAME $STATUS" > "$NODE_STATUS"
    fi

    # Count processed items. awk takes the first field so that wc
    # implementations that left-pad the number still yield a bare count
    # (cutting on a single space returns an empty string there).
    if [ -f "$processed_list" ]
    then
        NO_PROCESSED=$(wc -l < "$processed_list" | awk '{ print $1 }')
    else
        # Nothing has been recorded as processed yet.
        NO_PROCESSED=0
    fi

    NODE=$(cat "$PPSS_DIR/$NODE_ID")
    echo "$NODE $HOSTNAME $STATUS $NO_PROCESSED" > "$NODE_STATUS_FILE"

    if [ -n "$SSH_SERVER" ]
    then
        upload_status
    fi
}
|
||||
|
||||
|
||||
check_status () {
|
||||
|
||||
ERROR="$1"
|
||||
@ -1202,47 +1226,36 @@ stack_pop () {
|
||||
fi
|
||||
}
|
||||
|
||||
# Print the fourth field of every 'INSTANCE' line reported by
# ec2-describe-instances whose fourth field contains "pending".
# Empty output means no such line was found.
ec2_get_pending_nodes() {
    #
    # This function has never been tested by the author of PPSS.
    #
    # RES is kept local so this helper does not overwrite the RES scratch
    # variable used by other functions in this file.
    local RES
    RES="$(ec2-describe-instances | awk '/INSTANCE/ && $4 ~ /pending/ { print $4 }')"
    echo "$RES"
}
|
||||
is_screen_installed () {
|
||||
|
||||
ec2_launch_nodes() {
|
||||
#
|
||||
# This function has never been tested by the author of PPSS.
|
||||
#
|
||||
ec2run $AMI_ID -n $NUM_NODES -t $INSTANCE_TYPE -k $AWS_KEYPAIR -g $SECURITY_GROUP
|
||||
if [ "$DISABLE_SCREEN_TEST" == "1" ]
|
||||
then
|
||||
return 0
|
||||
fi
|
||||
|
||||
#
|
||||
# Loop until all nodes are started
|
||||
#
|
||||
STARTING="`ec2_get_pending_nodes`"
|
||||
while [ ! -z "$STARTING" ]
|
||||
do
|
||||
sleep 10
|
||||
STARTING="`ec2_get_pending_nodes`"
|
||||
log DSPLY "$STARTING"
|
||||
done
|
||||
#
|
||||
# Write all instances / nodes to the nodes file.
|
||||
#
|
||||
ec2-describe-instances | grep 'INSTANCE' | awk '{print $4}' | sed '/terminated/d' | sed '/pending/d' >> $NODES_FILE
|
||||
NO_OF_NODES="`wc -l $NODES_FILE | awk '{ print $1 }'`"
|
||||
log DSPLY "Number of nodes / instances: $NO_OF_NODES"
|
||||
NODE="$1"
|
||||
ssh -q $SSH_OPTS_NODE $SSH_KEY $USER@$NODE "screen -m -D -S test ls" > /dev/null 2>&1
|
||||
if [ ! "$?" == "0" ]
|
||||
then
|
||||
log ERROR "The 'Screen' command is not installed on node $NODE."
|
||||
return 1
|
||||
else
|
||||
log DEBUG "'Screen' is installed on node $NODE."
|
||||
fi
|
||||
}
|
||||
|
||||
deploy () {
|
||||
|
||||
NODE="$1"
|
||||
|
||||
SSH_OPTS_NODE="-o BatchMode=yes -o ControlPath=socket-%h \
|
||||
SSH_SOCKET="ppss_ssh_socket-$NODE"
|
||||
|
||||
SSH_OPTS_NODE="-o BatchMode=yes -o ControlPath=$SSH_SOCKET \
|
||||
-o GlobalKnownHostsFile=./known_hosts \
|
||||
-o ControlMaster=auto \
|
||||
-o Cipher=blowfish \
|
||||
-o ConnectTimeout=5 "
|
||||
|
||||
ERROR=0
|
||||
set_error () {
|
||||
|
||||
@ -1252,20 +1265,15 @@ deploy () {
|
||||
fi
|
||||
}
|
||||
|
||||
ssh -q -o ConnectTimeout=5 $SSH_KEY $USER@$NODE exit 0
|
||||
set_error "$?"
|
||||
if [ ! "$ERROR" == "0" ]
|
||||
then
|
||||
log ERROR "Cannot connect to node $NODE."
|
||||
return
|
||||
fi
|
||||
|
||||
ssh -N -M $SSH_OPTS_NODE $SSH_KEY $USER@$NODE &
|
||||
SSH_PID=$!
|
||||
|
||||
is_screen_installed "$NODE"
|
||||
|
||||
KEY=`echo $SSH_KEY | cut -d " " -f 2`
|
||||
|
||||
ssh -q $SSH_OPTS_NODE $SSH_KEY $USER@$NODE "cd ~ && mkdir $PPSS_HOME_DIR >> /dev/null 2>&1"
|
||||
ssh -q $SSH_OPTS_NODE $SSH_KEY $USER@$NODE "cd ~ && mkdir -p $PPSS_HOME_DIR && mkdir -p $PPSS_HOME_DIR/$JOB_LOG_DIR && mkdir -p $PPSS_HOME_DIR/ITEM_LOCK_DIR >> /dev/null 2>&1"
|
||||
ssh -q $SSH_OPTS_NODE $SSH_KEY $USER@$NODE "cd ~ && cd $PPSS_HOME_DIR && cd $PPSS_DIR && echo $NODE > $NODE_ID"
|
||||
scp -q $SSH_OPTS_NODE $SSH_KEY $0 $USER@$NODE:~/$PPSS_HOME_DIR
|
||||
set_error $?
|
||||
scp -q $SSH_OPTS_NODE $SSH_KEY $KEY $USER@$NODE:~/$PPSS_HOME_DIR
|
||||
@ -1274,6 +1282,7 @@ deploy () {
|
||||
set_error $?
|
||||
scp -q $SSH_OPTS_NODE $SSH_KEY known_hosts $USER@$NODE:~/$PPSS_HOME_DIR
|
||||
set_error $?
|
||||
|
||||
if [ ! -z "$SCRIPT" ]
|
||||
then
|
||||
scp -q $SSH_OPTS_NODE $SSH_KEY $SCRIPT $USER@$NODE:~/$PPSS_HOME_DIR
|
||||
@ -1298,19 +1307,20 @@ deploy () {
|
||||
|
||||
deploy_ppss () {
|
||||
|
||||
|
||||
if [ -z "$NODES_FILE" ]
|
||||
if [ -z "$NODES_FILE" ] || [ ! -e "$NODES_FILE" ]
|
||||
then
|
||||
log ERROR "ERROR - are you using the right option? -C ?"
|
||||
log ERROR "No file containing list of nodes missing / not specified."
|
||||
set_status ERROR
|
||||
cleanup
|
||||
exit 1
|
||||
fi
|
||||
|
||||
exec_cmd "mkdir $PPSS_HOME_DIR/$PPSS_NODE_STATUS"
|
||||
|
||||
KEY=`echo $SSH_KEY | cut -d " " -f 2`
|
||||
if [ -z "$KEY" ] || [ ! -e "$KEY" ]
|
||||
then
|
||||
log ERROR "Nodes require a key file."
|
||||
log ERROR "Private SSH key $KEY not found."
|
||||
cleanup
|
||||
set_status "ERROR"
|
||||
exit 1
|
||||
@ -1325,41 +1335,26 @@ deploy_ppss () {
|
||||
fi
|
||||
|
||||
INSTALLED_ON_SSH_SERVER=0
|
||||
if [ ! -e "$NODES_FILE" ]
|
||||
then
|
||||
log ERROR "File $NODES with list of nodes does not exist."
|
||||
set_status "ERROR"
|
||||
cleanup
|
||||
exit 1
|
||||
else
|
||||
if [ "$EC2" == "1" ]
|
||||
then
|
||||
ec2_launch_nodes
|
||||
fi
|
||||
for NODE in `cat $NODES_FILE`
|
||||
do
|
||||
deploy "$NODE" &
|
||||
if [ "$ARCH" == "SunOS" ]
|
||||
then
|
||||
sleep 1
|
||||
else
|
||||
sleep 0.1
|
||||
fi
|
||||
if [ "$NODE" == "$SSH_SERVER" ]
|
||||
then
|
||||
log DEBUG "SSH SERVER $SSH_SERVER is also a node."
|
||||
INSTALLED_ON_SSH_SERVER=1
|
||||
exec_cmd "mkdir -p $PPSS_HOME_DIR/$JOB_LOG_DIR"
|
||||
exec_cmd "mkdir -p $PPSS_HOME_DIR/$ITEM_LOCK_DIR"
|
||||
fi
|
||||
done
|
||||
if [ "$INSTALLED_ON_SSH_SERVER" == "0" ]
|
||||
for NODE in `cat $NODES_FILE`
|
||||
do
|
||||
deploy "$NODE" &
|
||||
if [ "$ARCH" == "SunOS" ]
|
||||
then
|
||||
log DEBUG "SSH SERVER $SSH_SERVER is not a node."
|
||||
deploy "$SSH_SERVER"
|
||||
exec_cmd "mkdir -p $PPSS_HOME_DIR/$JOB_LOG_DIR"
|
||||
exec_cmd "mkdir -p $PPSS_HOME_DIR/$ITEM_LOCK_DIR"
|
||||
sleep 1
|
||||
else
|
||||
sleep 0.1
|
||||
fi
|
||||
if [ "$NODE" == "$SSH_SERVER" ]
|
||||
then
|
||||
INSTALLED_ON_SSH_SERVER=1
|
||||
fi
|
||||
done
|
||||
|
||||
if [ "$INSTALLED_ON_SSH_SERVER" == "0" ]
|
||||
then
|
||||
log DEBUG "SSH SERVER $SSH_SERVER is not a node."
|
||||
else
|
||||
log DEBUG "SSH SERVER $SSH_SERVER is also a node."
|
||||
fi
|
||||
}
|
||||
|
||||
@ -1367,7 +1362,7 @@ start_ppss_on_node () {
|
||||
|
||||
NODE="$1"
|
||||
log DSPLY "Starting PPSS on node $NODE."
|
||||
ssh $SSH_KEY $USER@$NODE -o ConnectTimeout=5 "cd $PPSS_HOME_DIR ; screen -d -m -S PPSS ~/$PPSS_HOME_DIR/$0 node --config ~/$PPSS_HOME_DIR/$CONFIG"
|
||||
ssh $SSH_KEY $USER@$NODE -o ConnectTimeout=5 -o GlobalKnownHostsFile=./known_hosts "cd $PPSS_HOME_DIR ; screen -d -m -S PPSS ~/$PPSS_HOME_DIR/$0 node --config ~/$PPSS_HOME_DIR/$CONFIG"
|
||||
}
|
||||
|
||||
test_server () {
|
||||
@ -1701,32 +1696,6 @@ list_all_input_items () {
|
||||
IFS="$oldIFS"
|
||||
}
|
||||
|
||||
# Print, one per line, every word of $1 that does not occur as a word in $2.
# Words that do occur in $2 are reported through 'log DEBUG' instead.
#
# Arguments: $1 - list of words to check (split on IFS)
#            $2 - list of words to compare against (split on IFS)
# Outputs:   the words of $1 that are missing from $2, on stdout.
return_difference_between_vars () {

    # Locals keep the scratch names (a, b, ...) from leaking into the
    # script's global scope.
    local VAR_A="$1"
    local VAR_B="$2"
    local a b A_IS_IN_B

    # $VAR_A / $VAR_B are intentionally unquoted: the lists are split into
    # words here.
    for a in $VAR_A
    do
        A_IS_IN_B=0

        for b in $VAR_B
        do
            if [ "$a" = "$b" ]
            then
                A_IS_IN_B=1
                # One match is enough; no need to scan the rest of the list.
                break
            fi
        done

        if [ "$A_IS_IN_B" = "0" ]
        then
            echo "$a"
        else
            log DEBUG "Value $a occurs in 'var b'"
        fi
    done
}
|
||||
|
||||
remove_processed_items_from_input_file () {
|
||||
|
||||
#
|
||||
@ -2679,17 +2648,30 @@ start_all_workers () {
|
||||
done
|
||||
}
|
||||
|
||||
#get_status_of_node () {
|
||||
#
|
||||
# NODE="$1"
|
||||
# NODE_HOSTNAME=`ssh $SSH_OPTS_NODE $SSH_KEY $USER@$NODE hostname`
|
||||
# STATUS=`ssh $SSH_OPTS_NODE $SSH_KEY $USER@$NODE cat "$PPSS_HOME_DIR/$PPSS_NODE_STATUS/$NODE_HOSTNAME-status.txt" 2>/dev/null`
|
||||
# echo "STATUS=ssh $SSH_OPTS_NODE $SSH_KEY $USER@$NODE cat $PPSS_HOME_DIR/$PPSS_DIR/$PPSS_NODE_STATUS/$NODE_HOSTNAME-status.txt 2>/dev/null" >> abc.txt
|
||||
# ERROR="$?"
|
||||
# if [ ! "$ERROR" == "0" ]
|
||||
# then
|
||||
# STATUS="UNKNOWN"
|
||||
# fi
|
||||
# echo "$STATUS"
|
||||
#}
|
||||
|
||||
#get_node_status_from_server () {
|
||||
get_status_of_node () {
|
||||
|
||||
NODE="$1"
|
||||
NODE_HOSTNAME=`ssh -o ConnectTimeout=10 $SSH_KEY $USER@NODE hostname`
|
||||
STATUS=`ssh -o ConnectTimeout=10 $SSH_KEY $USER@$NODE cat "$PPSS_HOME_DIR/$NODE_HOSTNAME-status.txt" 2>/dev/null`
|
||||
ERROR="$?"
|
||||
if [ ! "$ERROR" == "0" ]
|
||||
then
|
||||
STATUS="UNKNOWN"
|
||||
fi
|
||||
echo "$STATUS"
|
||||
RES=`ssh $SSH_OPTS $SSH_KEY $USER@$SERVER cat "$PPSS_HOME_DIR/$PPSS_NODE_STATUS/*" 2> /dev/null`
|
||||
IFS=$'\n'
|
||||
for x in $RES
|
||||
do
|
||||
log DSPLY "$x"
|
||||
done
|
||||
|
||||
}
|
||||
|
||||
show_status () {
|
||||
@ -2706,7 +2688,7 @@ show_status () {
|
||||
|
||||
if [ ! -z "$ITEMS" ] && [ ! "$ITEMS" == "0" ]
|
||||
then
|
||||
PROCESSED=`exec_cmd "ls -1 $ITEM_LOCK_DIR 2>/dev/null | wc -l" 1` 2>&1 >> /dev/null
|
||||
PROCESSED=`exec_cmd "ls -1 $PPSS_HOME_DIR/$ITEM_LOCK_DIR 2>/dev/null | wc -l" 1` 2>&1 >> /dev/null
|
||||
TMP_STATUS=$((100 * $PROCESSED / $ITEMS))
|
||||
log DSPLY "Status:\t\t$TMP_STATUS percent complete."
|
||||
else
|
||||
@ -2720,12 +2702,12 @@ show_status () {
|
||||
fi
|
||||
log DSPLY "Items:\t\t$ITEMS"
|
||||
|
||||
|
||||
log DSPLY "---------------------------------------------------------"
|
||||
HEADER=`echo IP-address Hostname Processed Status | awk '{ printf ("%-16s %-18s % 10s %10s\n",$1,$2,$3,$4) }'`
|
||||
log DSPLY "$HEADER"
|
||||
log DSPLY "---------------------------------------------------------"
|
||||
PROCESSED=0
|
||||
|
||||
for x in `cat $NODES_FILE`
|
||||
do
|
||||
RES=0
|
||||
@ -2825,20 +2807,9 @@ main () {
|
||||
exit 0
|
||||
;;
|
||||
deploy )
|
||||
LOGFILE=/dev/null
|
||||
LOGFILE=ppss-deploy.txt
|
||||
display_header
|
||||
log DSPLY "Deploying PPSS on nodes."
|
||||
deploy_ppss
|
||||
wait
|
||||
cleanup
|
||||
exit 0
|
||||
;;
|
||||
ec2)
|
||||
EC2=1
|
||||
LOGFILE=/dev/null
|
||||
display_header
|
||||
log INFO "Deploying PPSS on EC2 nodes."
|
||||
ec2_launch_nodes
|
||||
log DSPLY "Deploying PPSS on nodes. See ppss-deploy.txt for details."
|
||||
deploy_ppss
|
||||
wait
|
||||
cleanup
|
||||
@ -2847,8 +2818,7 @@ main () {
|
||||
status )
|
||||
LOGFILE=/dev/null
|
||||
display_header
|
||||
init_vars
|
||||
get_all_items
|
||||
test_server
|
||||
show_status
|
||||
exit 0
|
||||
;;
|
||||
|
@ -1,7 +1,7 @@
|
||||
#!/bin/bash
|
||||
|
||||
DEBUG="$1"
|
||||
VERSION="2.86"
|
||||
VERSION="2.90"
|
||||
TMP_DIR="/tmp/ppss"
|
||||
PPSS=./ppss
|
||||
PPSS_DIR=ppss_dir
|
||||
|
Loading…
Reference in New Issue
Block a user