Build retry loop for screen sessions

There is a timing window in which the command stuffed into a screen
window can be lost because bash is still starting up in that window.
When that happens, retry starting the service in its screen window.

Change-Id: I49247de06bbd59424cb10fb9a8db145907be5138
Related-Bug: #1331274
diff --git a/functions-common b/functions-common
index 613a86c..0bf354f 100644
--- a/functions-common
+++ b/functions-common
@@ -1058,44 +1058,100 @@
     echo $!
 }
 
+function _start_in_screen {  # usage: _start_in_screen service command-line
+    local service=$1
+    local cmd=$2
+    local screen_name=${SCREEN_NAME:-stack}
+    local status_dir=${SERVICE_DIR:-${DEST}/status}
+    local service_dir="$status_dir/$screen_name"
+    local pid="$service_dir/$service.pid"
+    local failure="$service_dir/$service.failure"
+
+    if [[ -n ${SCREEN_LOGDIR} ]]; then
+        local logfile=${SCREEN_LOGDIR}/screen-${service}.${CURRENT_LOG_TIME}.log
+        local shortlog=${SCREEN_LOGDIR}/screen-${service}.log
+        # Log the window to a timestamped file and symlink the short name to it (original note: this whole dance is done because of slow nodes)
+        screen -S $screen_name -p $service -X logfile ${logfile}
+        screen -S $screen_name -p $service -X log on
+        ln -sf ${logfile} ${shortlog}
+    fi
+
+    NL=`echo -ne '\015'`  # carriage return (octal 015) appended to the stuffed text; NOTE: NL is not declared local
+    # The line typed into the window does the following:
+    # - runs the passed server command in the background
+    # - saves the pid of that background process to the $pid file
+    # - brings the server process back to the foreground with fg
+    # - if fg returns non-zero (e.g. the server exited prematurely), a
+    #   failed-to-start message is written to stdout and to the $failure file
+    # The pid saved can be used in screen_stop() as a process group
+    # id to kill off all child processes
+    echo "Running: $cmd & echo \$! >$pid; fg || echo \"$service failed to start\" | tee \"$failure\"$NL"
+    screen -S $screen_name -p $service -X stuff "$cmd & echo \$! >$pid; fg || echo \"$service failed to start\" | tee \"$failure\"$NL"
+}
+
+
+function _is_running_in_screen {  # usage: _is_running_in_screen service; returns 0 if start markers exist, 1 otherwise
+    local service=$1
+    local screen_name=${SCREEN_NAME:-stack}
+    local status_dir=${SERVICE_DIR:-${DEST}/status}
+    local service_dir="$status_dir/$screen_name"
+    local pid="$service_dir/$service.pid"
+    local failure="$service_dir/$service.failure"
+    if [[ ! -e "$pid" && ! -e "$failure" ]]; then
+        # Neither a pid file nor a failure file exists, so the command stuffed
+        # into the window presumably never reached the shell there.
+        echo "Warning: neither $pid nor $failure exist, $service didn't seem to start"
+        return 1
+    fi
+    if [[ -n ${SCREEN_LOGDIR} ]]; then
+        # Logging is enabled, so a missing timestamped log file means something went wrong.
+        local logfile=${SCREEN_LOGDIR}/screen-${service}.${CURRENT_LOG_TIME}.log
+        if [[ ! -e "$logfile" ]]; then
+            echo "Warning: expected logfile $logfile not found, something wrong with starting $service"
+            return 1
+        fi
+    fi
+    return 0  # a pid or failure file exists (and the log file, when logging is enabled); no live-process check is done
+}
+
 # Helper to launch a service in a named screen
 # screen_it service "command-line"
 function screen_it {
-    SCREEN_NAME=${SCREEN_NAME:-stack}
-    SERVICE_DIR=${SERVICE_DIR:-${DEST}/status}
-    USE_SCREEN=$(trueorfalse True $USE_SCREEN)
+    local service=$1
+    local cmd=$2
+    local screen_name=${SCREEN_NAME:-stack}
+    local status_dir=${SERVICE_DIR:-${DEST}/status}
+    local service_dir="$status_dir/$screen_name"
+    local use_screen=$(trueorfalse True $USE_SCREEN)
+    local pid="$service_dir/$service.pid"
 
     if is_service_enabled $1; then
         # Append the service to the screen rc file
-        screen_rc "$1" "$2"
+        screen_rc "$service" "$cmd"
 
-        if [[ "$USE_SCREEN" = "True" ]]; then
-            screen -S $SCREEN_NAME -X screen -t $1
+        if [[ "$use_screen" = "True" ]]; then
+            screen -S $screen_name -X screen -t $service
 
-            if [[ -n ${SCREEN_LOGDIR} ]]; then
-                screen -S $SCREEN_NAME -p $1 -X logfile ${SCREEN_LOGDIR}/screen-${1}.${CURRENT_LOG_TIME}.log
-                screen -S $SCREEN_NAME -p $1 -X log on
-                ln -sf ${SCREEN_LOGDIR}/screen-${1}.${CURRENT_LOG_TIME}.log ${SCREEN_LOGDIR}/screen-${1}.log
-            fi
-
-            # sleep to allow bash to be ready to be send the command - we are
-            # creating a new window in screen and then sends characters, so if
-            # bash isn't running by the time we send the command, nothing happens
-            sleep 3
-
-            NL=`echo -ne '\015'`
-            # This fun command does the following:
-            # - the passed server command is backgrounded
-            # - the pid of the background process is saved in the usual place
-            # - the server process is brought back to the foreground
-            # - if the server process exits prematurely the fg command errors
-            #   and a message is written to stdout and the service failure file
-            # The pid saved can be used in screen_stop() as a process group
-            # id to kill off all child processes
-            screen -S $SCREEN_NAME -p $1 -X stuff "$2 & echo \$! >$SERVICE_DIR/$SCREEN_NAME/$1.pid; fg || echo \"$1 failed to start\" | tee \"$SERVICE_DIR/$SCREEN_NAME/$1.failure\"$NL"
+            # Retry up to 10 times, 1 second apart: if bash is not yet running in the
+            # new window when the command is stuffed, the command is lost (slow nodes).
+            local screen_tries=0
+            while [ "$screen_tries" -lt 10 ]; do
+                _start_in_screen "$service" "$cmd"
+                if _is_running_in_screen $service; then
+                    screen_tries=10  # success: make the loop condition false
+                else
+                    screen_tries=$[screen_tries + 1]
+                    echo "Failed to start service after $screen_tries attempt(s), retrying"
+                    if [[ "$screen_tries" -eq 10 ]]; then
+                        echo "Too many retries, giving up"
+                        exit 1  # NOTE: exit rather than return, so the shell running screen_it terminates
+                    fi
+                    sleep 1
+                fi
+            done
         else
             # Spawn directly without screen
-            run_process "$1" "$2" >$SERVICE_DIR/$SCREEN_NAME/$1.pid
+            run_process "$service" "$cmd" >$pid
         fi
     fi
 }