Async task support

We have a *ton* of stuff in devstack that is very linear, specifically
the ten-ish minutes we spend loading osc to run a single API command
against something. We also generate configs, sync databases, and other
things that use one core of our worker and make our runtime longer
than it really needs to be.

The idea in this patch is to make it super simple to run some things
in the background and then wait for them to finish before proceeding
to something that will require them to be done. This avoids the
interleaving you would expect by redirecting the async tasks to a log
file, and then cat'ing that log file synchronously during the wait
operation. The per-task log file remains so it's easier to examine
it in isolation.

Multiple people have reported between 22-30% improvement in the
time it takes to stack with this. More can be done, but what is here
already makes a significant difference.

Change-Id: I270a910b531641b023c13f75dfedca057a1f1031
diff --git a/.zuul.yaml b/.zuul.yaml
index c140671..7b0696b 100644
--- a/.zuul.yaml
+++ b/.zuul.yaml
@@ -599,6 +599,17 @@
     voting: false
 
 - job:
+    name: devstack-async
+    parent: tempest-full-py3
+    description: Async mode enabled
+    voting: false
+    vars:
+      devstack_localrc:
+        DEVSTACK_PARALLEL: True
+      zuul_copy_output:
+        /opt/stack/async: logs
+
+- job:
     name: devstack-platform-fedora-latest
     parent: tempest-full-py3
     description: Fedora latest platform test
@@ -690,6 +701,7 @@
         - devstack-platform-fedora-latest
         - devstack-platform-centos-8
         - devstack-platform-bionic
+        - devstack-async
         - devstack-multinode
         - devstack-unit-tests
         - openstack-tox-bashate
diff --git a/clean.sh b/clean.sh
index 4cebf1d..870dfd4 100755
--- a/clean.sh
+++ b/clean.sh
@@ -113,7 +113,7 @@
 cleanup_database
 
 # Clean out data and status
-sudo rm -rf $DATA_DIR $DEST/status
+sudo rm -rf $DATA_DIR $DEST/status $DEST/async
 
 # Clean out the log file and log directories
 if [[ -n "$LOGFILE" ]] && [[ -f "$LOGFILE" ]]; then
diff --git a/extras.d/80-tempest.sh b/extras.d/80-tempest.sh
index 15ecfe3..06c73ec 100644
--- a/extras.d/80-tempest.sh
+++ b/extras.d/80-tempest.sh
@@ -6,7 +6,7 @@
         source $TOP_DIR/lib/tempest
     elif [[ "$1" == "stack" && "$2" == "install" ]]; then
         echo_summary "Installing Tempest"
-        install_tempest
+        async_runfunc install_tempest
     elif [[ "$1" == "stack" && "$2" == "post-config" ]]; then
         # Tempest config must come after layer 2 services are running
         :
@@ -17,6 +17,7 @@
         # local.conf Tempest option overrides
         :
     elif [[ "$1" == "stack" && "$2" == "test-config" ]]; then
+        async_wait install_tempest
         echo_summary "Initializing Tempest"
         configure_tempest
         echo_summary "Installing Tempest Plugins"
diff --git a/functions b/functions
index fc87a55..89bbab2 100644
--- a/functions
+++ b/functions
@@ -21,6 +21,7 @@
 source ${FUNC_DIR}/inc/meta-config
 source ${FUNC_DIR}/inc/python
 source ${FUNC_DIR}/inc/rootwrap
+source ${FUNC_DIR}/inc/async
 
 # Save trace setting
 _XTRACE_FUNCTIONS=$(set +o | grep xtrace)
diff --git a/inc/async b/inc/async
new file mode 100644
index 0000000..d29168f
--- /dev/null
+++ b/inc/async
@@ -0,0 +1,225 @@
+#!/bin/bash
+#
+# Symbolic asynchronous tasks for devstack
+#
+# Usage:
+#
+#  async_runfunc my_shell_func foo bar baz
+#
+#  ... do other stuff ...
+#
+#  async_wait my_shell_func
+#
+
+DEVSTACK_PARALLEL=$(trueorfalse False DEVSTACK_PARALLEL)
+_ASYNC_BG_TIME=0
+
+# Keep track of how much total time was spent in background tasks
+# Takes a job runtime in ms.
+function _async_incr_bg_time {
+    local elapsed_ms="$1"
+    _ASYNC_BG_TIME=$(($_ASYNC_BG_TIME + $elapsed_ms))
+}
+
+# Get the PID of a named future to wait on
+function async_pidof {
+    local name="$1"
+    local inifile="${DEST}/async/${name}.ini"
+
+    if [ -f "$inifile" ]; then
+        iniget $inifile job pid
+    else
+        echo 'UNKNOWN'
+        return 1
+    fi
+}
+
+# Log a message about a job. If the message contains "%command" then the
+# full command line of the job will be substituted in the output
+function async_log {
+    local name="$1"
+    shift
+    local message="$*"
+    local inifile=${DEST}/async/${name}.ini
+    local pid
+    local command
+
+    pid=$(iniget $inifile job pid)
+    command=$(iniget $inifile job command | tr '#' '-')
+    message=$(echo "$message" | sed "s#%command#$command#g")
+
+    echo "[Async ${name}:${pid}]: $message"
+}
+
+# Inner function that actually runs the requested task. We wrap it like this
+# just so we can emit a finish message as soon as the work is done, to make
+# it easier to find the tracking just before an error.
+function async_inner {
+    local name="$1"
+    local rc
+    shift
+    set -o xtrace
+    if $* >${DEST}/async/${name}.log 2>&1; then
+        rc=0
+        set +o xtrace
+        async_log "$name" "finished successfully"
+    else
+        rc=$?
+        set +o xtrace
+        async_log "$name" "FAILED with rc $rc"
+    fi
+    iniset ${DEST}/async/${name}.ini job end_time $(date "+%s%3N")
+    return $rc
+}
+
+# Run something async. Takes a symbolic name and a list of arguments of
+# what to run. Ideally this would be rarely used and async_runfunc() would
+# be used everywhere for readability.
+#
+# This spawns the work in a background worker, records a "future" to be
+# collected by a later call to async_wait()
+function async_run {
+    local xtrace
+    xtrace=$(set +o | grep xtrace)
+    set +o xtrace
+
+    local name="$1"
+    shift
+    local inifile=${DEST}/async/${name}.ini
+
+    touch $inifile
+    iniset $inifile job command "$*"
+    iniset $inifile job start_time $(date +%s%3N)
+
+    if [[ "$DEVSTACK_PARALLEL" = "True" ]]; then
+        async_inner $name $* &
+        iniset $inifile job pid $!
+        async_log "$name" "running: %command"
+        $xtrace
+    else
+        iniset $inifile job pid "self"
+        async_log "$name" "Running synchronously: %command"
+        $xtrace
+        $*
+        return $?
+    fi
+}
+
+# Shortcut for running a shell function async. Uses the function name as the
+# async name.
+function async_runfunc {
+    async_run $1 $*
+}
+
+# Wait for an async future to complete. May return immediately if already
+# complete, or of the future has already been waited on (avoid this). May
+# block until the future completes.
+function async_wait {
+    local xtrace
+    xtrace=$(set +o | grep xtrace)
+    set +o xtrace
+
+    local pid rc running inifile runtime
+    rc=0
+    for name in $*; do
+        running=$(ls ${DEST}/async/*.ini 2>/dev/null | wc -l)
+        inifile="${DEST}/async/${name}.ini"
+
+        if pid=$(async_pidof "$name"); then
+            async_log "$name" "Waiting for completion of %command" \
+                      "($running other jobs running)"
+            time_start async_wait
+            if [[ "$pid" != "self" ]]; then
+                # Do not actually call wait if we ran synchronously
+                if wait $pid; then
+                    rc=0
+                else
+                    rc=$?
+                fi
+                cat ${DEST}/async/${name}.log
+            fi
+            time_stop async_wait
+            local start_time
+            local end_time
+            start_time=$(iniget $inifile job start_time)
+            end_time=$(iniget $inifile job end_time)
+            _async_incr_bg_time $(($end_time - $start_time))
+            runtime=$((($end_time - $start_time) / 1000))
+            async_log "$name" "finished %command with result" \
+                      "$rc in $runtime seconds"
+            rm -f $inifile
+            if [ $rc -ne 0 ]; then
+                echo Stopping async wait due to error: $*
+                break
+            fi
+        else
+            # This could probably be removed - it is really just here
+            # to help notice if you wait for something by the wrong
+            # name, but it also shows up for things we didn't start
+            # because they were not enabled.
+            echo Not waiting for async task $name that we never started or \
+                 has already been waited for
+        fi
+    done
+
+    $xtrace
+    return $rc
+}
+
+# Check for uncollected futures and wait on them
+function async_cleanup {
+    local name
+
+    if [[ "$DEVSTACK_PARALLEL" != "True" ]]; then
+        return 0
+    fi
+
+    for inifile in $(find ${DEST}/async -name '*.ini'); do
+        name=$(basename $pidfile .ini)
+        echo "WARNING: uncollected async future $name"
+        async_wait $name || true
+    done
+}
+
+# Make sure our async dir is created and clean
+function async_init {
+    local async_dir=${DEST}/async
+
+    # Clean any residue if present from previous runs
+    rm -Rf $async_dir
+
+    # Make sure we have a state directory
+    mkdir -p $async_dir
+}
+
+function async_print_timing {
+    local bg_time_minus_wait
+    local elapsed_time
+    local serial_time
+    local speedup
+
+    if [[ "$DEVSTACK_PARALLEL" != "True" ]]; then
+        return 0
+    fi
+
+    # The logic here is: All the background task time would be
+    # serialized if we did not do them in the background. So we can
+    # add that to the elapsed time for the whole run. However, time we
+    # spend waiting for async things to finish adds to the elapsed
+    # time, but is time where we're not doing anything useful. Thus,
+    # we substract that from the would-be-serialized time.
+
+    bg_time_minus_wait=$((\
+            ($_ASYNC_BG_TIME - ${_TIME_TOTAL[async_wait]}) / 1000))
+    elapsed_time=$(($(date "+%s") - $_TIME_BEGIN))
+    serial_time=$(($elapsed_time + $bg_time_minus_wait))
+
+    echo
+    echo "================="
+    echo " Async summary"
+    echo "================="
+    echo " Time spent in the background minus waits: $bg_time_minus_wait sec"
+    echo " Elapsed time: $elapsed_time sec"
+    echo " Time if we did everything serially: $serial_time sec"
+    echo " Speedup: " $(echo | awk "{print $serial_time / $elapsed_time}")
+}
diff --git a/lib/keystone b/lib/keystone
index d4c7b06..66e867c 100644
--- a/lib/keystone
+++ b/lib/keystone
@@ -318,25 +318,25 @@
     local admin_role="admin"
     local member_role="member"
 
-    get_or_add_user_domain_role $admin_role $admin_user default
+    async_run ks-domain-role get_or_add_user_domain_role $admin_role $admin_user default
 
     # Create service project/role
     get_or_create_domain "$SERVICE_DOMAIN_NAME"
-    get_or_create_project "$SERVICE_PROJECT_NAME" "$SERVICE_DOMAIN_NAME"
+    async_run ks-project get_or_create_project "$SERVICE_PROJECT_NAME" "$SERVICE_DOMAIN_NAME"
 
     # Service role, so service users do not have to be admins
-    get_or_create_role service
+    async_run ks-service get_or_create_role service
 
     # The ResellerAdmin role is used by Nova and Ceilometer so we need to keep it.
     # The admin role in swift allows a user to act as an admin for their project,
     # but ResellerAdmin is needed for a user to act as any project. The name of this
     # role is also configurable in swift-proxy.conf
-    get_or_create_role ResellerAdmin
+    async_run ks-reseller get_or_create_role ResellerAdmin
 
     # another_role demonstrates that an arbitrary role may be created and used
     # TODO(sleepsonthefloor): show how this can be used for rbac in the future!
     local another_role="anotherrole"
-    get_or_create_role $another_role
+    async_run ks-anotherrole get_or_create_role $another_role
 
     # invisible project - admin can't see this one
     local invis_project
@@ -349,10 +349,12 @@
     demo_user=$(get_or_create_user "demo" \
         "$ADMIN_PASSWORD" "default" "demo@example.com")
 
-    get_or_add_user_project_role $member_role $demo_user $demo_project
-    get_or_add_user_project_role $admin_role $admin_user $demo_project
-    get_or_add_user_project_role $another_role $demo_user $demo_project
-    get_or_add_user_project_role $member_role $demo_user $invis_project
+    async_wait ks-{domain-role,domain,project,service,reseller,anotherrole}
+
+    async_run ks-demo-member get_or_add_user_project_role $member_role $demo_user $demo_project
+    async_run ks-demo-admin get_or_add_user_project_role $admin_role $admin_user $demo_project
+    async_run ks-demo-another get_or_add_user_project_role $another_role $demo_user $demo_project
+    async_run ks-demo-invis get_or_add_user_project_role $member_role $demo_user $invis_project
 
     # alt_demo
     local alt_demo_project
@@ -361,9 +363,9 @@
     alt_demo_user=$(get_or_create_user "alt_demo" \
         "$ADMIN_PASSWORD" "default" "alt_demo@example.com")
 
-    get_or_add_user_project_role $member_role $alt_demo_user $alt_demo_project
-    get_or_add_user_project_role $admin_role $admin_user $alt_demo_project
-    get_or_add_user_project_role $another_role $alt_demo_user $alt_demo_project
+    async_run ks-alt-member get_or_add_user_project_role $member_role $alt_demo_user $alt_demo_project
+    async_run ks-alt-admin get_or_add_user_project_role $admin_role $admin_user $alt_demo_project
+    async_run ks-alt-another get_or_add_user_project_role $another_role $alt_demo_user $alt_demo_project
 
     # groups
     local admin_group
@@ -373,11 +375,15 @@
     non_admin_group=$(get_or_create_group "nonadmins" \
         "default" "non-admin group")
 
-    get_or_add_group_project_role $member_role $non_admin_group $demo_project
-    get_or_add_group_project_role $another_role $non_admin_group $demo_project
-    get_or_add_group_project_role $member_role $non_admin_group $alt_demo_project
-    get_or_add_group_project_role $another_role $non_admin_group $alt_demo_project
-    get_or_add_group_project_role $admin_role $admin_group $admin_project
+    async_run ks-group-memberdemo get_or_add_group_project_role $member_role $non_admin_group $demo_project
+    async_run ks-group-anotherdemo get_or_add_group_project_role $another_role $non_admin_group $demo_project
+    async_run ks-group-memberalt get_or_add_group_project_role $member_role $non_admin_group $alt_demo_project
+    async_run ks-group-anotheralt get_or_add_group_project_role $another_role $non_admin_group $alt_demo_project
+    async_run ks-group-admin get_or_add_group_project_role $admin_role $admin_group $admin_project
+
+    async_wait ks-demo-{member,admin,another,invis}
+    async_wait ks-alt-{member,admin,another}
+    async_wait ks-group-{memberdemo,anotherdemo,memberalt,anotheralt,admin}
 
     if is_service_enabled ldap; then
         create_ldap_domain
diff --git a/lib/nova b/lib/nova
index d742603..0a28cd9 100644
--- a/lib/nova
+++ b/lib/nova
@@ -741,31 +741,36 @@
     sudo install -d -o $STACK_USER ${NOVA_STATE_PATH} ${NOVA_STATE_PATH}/keys
 }
 
+function init_nova_db {
+    local dbname="$1"
+    local conffile="$2"
+    recreate_database $dbname
+    $NOVA_BIN_DIR/nova-manage --config-file $conffile db sync --local_cell
+}
+
 # init_nova() - Initialize databases, etc.
 function init_nova {
     # All nova components talk to a central database.
     # Only do this step once on the API node for an entire cluster.
     if is_service_enabled $DATABASE_BACKENDS && is_service_enabled n-api; then
+        # (Re)create nova databases
+        async_run nova-cell-0 init_nova_db nova_cell0 $NOVA_CONF
+        for i in $(seq 1 $NOVA_NUM_CELLS); do
+            async_run nova-cell-$i init_nova_db nova_cell${i} $(conductor_conf $i)
+        done
+
         recreate_database $NOVA_API_DB
         $NOVA_BIN_DIR/nova-manage --config-file $NOVA_CONF api_db sync
 
-        recreate_database nova_cell0
-
         # map_cell0 will create the cell mapping record in the nova_api DB so
-        # this needs to come after the api_db sync happens. We also want to run
-        # this before the db sync below since that will migrate both the nova
-        # and nova_cell0 databases.
+        # this needs to come after the api_db sync happens.
         $NOVA_BIN_DIR/nova-manage cell_v2 map_cell0 --database_connection `database_connection_url nova_cell0`
 
-        # (Re)create nova databases
-        for i in $(seq 1 $NOVA_NUM_CELLS); do
-            recreate_database nova_cell${i}
-            $NOVA_BIN_DIR/nova-manage --config-file $(conductor_conf $i) db sync --local_cell
+        # Wait for DBs to finish from above
+        for i in $(seq 0 $NOVA_NUM_CELLS); do
+            async_wait nova-cell-$i
         done
 
-        # Migrate nova and nova_cell0 databases.
-        $NOVA_BIN_DIR/nova-manage --config-file $NOVA_CONF db sync
-
         # Run online migrations on the new databases
         # Needed for flavor conversion
         $NOVA_BIN_DIR/nova-manage --config-file $NOVA_CONF db online_data_migrations
diff --git a/stack.sh b/stack.sh
index 036afd7..dcfef6d 100755
--- a/stack.sh
+++ b/stack.sh
@@ -330,6 +330,9 @@
     safe_chmod 0755 $DATA_DIR
 fi
 
+# Create and/or clean the async state directory
+async_init
+
 # Configure proper hostname
 # Certain services such as rabbitmq require that the local hostname resolves
 # correctly.  Make sure it exists in /etc/hosts so that is always true.
@@ -1082,19 +1085,19 @@
 
     create_keystone_accounts
     if is_service_enabled nova; then
-        create_nova_accounts
+        async_runfunc create_nova_accounts
     fi
     if is_service_enabled glance; then
-        create_glance_accounts
+        async_runfunc create_glance_accounts
     fi
     if is_service_enabled cinder; then
-        create_cinder_accounts
+        async_runfunc create_cinder_accounts
     fi
     if is_service_enabled neutron; then
-        create_neutron_accounts
+        async_runfunc create_neutron_accounts
     fi
     if is_service_enabled swift; then
-        create_swift_accounts
+        async_runfunc create_swift_accounts
     fi
 
 fi
@@ -1107,9 +1110,11 @@
 
 if is_service_enabled horizon; then
     echo_summary "Configuring Horizon"
-    configure_horizon
+    async_runfunc configure_horizon
 fi
 
+async_wait create_nova_accounts create_glance_accounts create_cinder_accounts
+async_wait create_neutron_accounts create_swift_accounts configure_horizon
 
 # Glance
 # ------
@@ -1117,7 +1122,7 @@
 # NOTE(yoctozepto): limited to node hosting the database which is the controller
 if is_service_enabled $DATABASE_BACKENDS && is_service_enabled glance; then
     echo_summary "Configuring Glance"
-    init_glance
+    async_runfunc init_glance
 fi
 
 
@@ -1131,7 +1136,7 @@
 
     # Run init_neutron only on the node hosting the Neutron API server
     if is_service_enabled $DATABASE_BACKENDS && is_service_enabled neutron; then
-        init_neutron
+        async_runfunc init_neutron
     fi
 fi
 
@@ -1161,7 +1166,7 @@
 
 if is_service_enabled swift; then
     echo_summary "Configuring Swift"
-    init_swift
+    async_runfunc init_swift
 fi
 
 
@@ -1170,7 +1175,7 @@
 
 if is_service_enabled cinder; then
     echo_summary "Configuring Cinder"
-    init_cinder
+    async_runfunc init_cinder
 fi
 
 # Placement Service
@@ -1178,9 +1183,16 @@
 
 if is_service_enabled placement; then
     echo_summary "Configuring placement"
-    init_placement
+    async_runfunc init_placement
 fi
 
+# Wait for neutron and placement before starting nova
+async_wait init_neutron
+async_wait init_placement
+async_wait init_glance
+async_wait init_swift
+async_wait init_cinder
+
 # Compute Service
 # ---------------
 
@@ -1192,7 +1204,7 @@
     # TODO(stephenfin): Is it possible for neutron to *not* be enabled now? If
     # not, remove the if here
     if is_service_enabled neutron; then
-        configure_neutron_nova
+        async_runfunc configure_neutron_nova
     fi
 fi
 
@@ -1236,6 +1248,8 @@
     iniset $CINDER_CONF key_manager fixed_key "$FIXED_KEY"
 fi
 
+async_wait configure_neutron_nova
+
 # Launch the nova-api and wait for it to answer before continuing
 if is_service_enabled n-api; then
     echo_summary "Starting Nova API"
@@ -1282,7 +1296,7 @@
 if is_service_enabled nova; then
     echo_summary "Starting Nova"
     start_nova
-    create_flavors
+    async_runfunc create_flavors
 fi
 if is_service_enabled cinder; then
     echo_summary "Starting Cinder"
@@ -1331,6 +1345,8 @@
     start_horizon
 fi
 
+async_wait create_flavors
+
 
 # Create account rc files
 # =======================
@@ -1467,8 +1483,12 @@
     exec 1>&3
 fi
 
+# Make sure we didn't leak any background tasks
+async_cleanup
+
 # Dump out the time totals
 time_totals
+async_print_timing
 
 # Using the cloud
 # ===============
diff --git a/unstack.sh b/unstack.sh
index 3197cf1..d9dca7c 100755
--- a/unstack.sh
+++ b/unstack.sh
@@ -184,3 +184,4 @@
 fi
 
 clean_pyc_files
+rm -Rf $DEST/async