Work around CHILD_MAX bash limitation for async
Apparently bash (following POSIX) only guarantees that a small number
(roughly 32) of children can be started and still have their exit
statuses retrieved at any given point. On larger jobs with lots of
plugins and additional work, we may go over that limit, especially
for long-lived children, such as the install_tempest task.
This works around that issue by creating a fifo for each child at
spawn time. When the child has finished its work, it blocks on a read
from that fifo (and thus does not exit). When the parent is ready to
wait on the child, it first writes to that fifo, unblocking the child
so that it exits right around the time the parent calls wait.
Closes-Bug: #1923728
Change-Id: Id755bdb1e7f1664ec08742d034c174e87a3d2902
diff --git a/inc/async b/inc/async
index c63bc20..11bcdfa 100644
--- a/inc/async
+++ b/inc/async
@@ -57,6 +57,7 @@
function async_inner {
local name="$1"
local rc
+ local fifo=${DEST}/async/${name}.fifo
shift
set -o xtrace
if $* >${DEST}/async/${name}.log 2>&1; then
@@ -69,6 +70,8 @@
async_log "$name" "FAILED with rc $rc"
fi
iniset ${DEST}/async/${name}.ini job end_time $(date "+%s%3N")
+ # Block on the fifo until we are signaled to exit by the main process
+ cat $fifo
return $rc
}
@@ -86,12 +89,14 @@
local name="$1"
shift
local inifile=${DEST}/async/${name}.ini
+ local fifo=${DEST}/async/${name}.fifo
touch $inifile
iniset $inifile job command "$*"
iniset $inifile job start_time $(date +%s%3N)
if [[ "$DEVSTACK_PARALLEL" = "True" ]]; then
+ mkfifo $fifo
async_inner $name $* &
iniset $inifile job pid $!
async_log "$name" "running: %command"
@@ -119,17 +124,23 @@
xtrace=$(set +o | grep xtrace)
set +o xtrace
- local pid rc running inifile runtime
+ local pid rc running inifile runtime fifo
rc=0
for name in $*; do
running=$(ls ${DEST}/async/*.ini 2>/dev/null | wc -l)
inifile="${DEST}/async/${name}.ini"
+ fifo=${DEST}/async/${name}.fifo
if pid=$(async_pidof "$name"); then
async_log "$name" "Waiting for completion of %command" \
"($running other jobs running)"
time_start async_wait
if [[ "$pid" != "self" ]]; then
+ # Signal the child to go ahead and exit since we are about to
+ # wait for it to collect its status.
+ echo "Signaling exit"
+ echo WAKEUP > $fifo
+ echo "Signaled"
# Do not actually call wait if we ran synchronously
if wait $pid; then
rc=0
@@ -137,6 +148,7 @@
rc=$?
fi
cat ${DEST}/async/${name}.log
+ rm -f $fifo
fi
time_stop async_wait
local start_time