Add argument to stop stress test on first error

If a single stress test action fails, it is often useful to stop
processing completely, since all of the other actions would most likely
fail as well. Add a stop_on_error argument to the stress driver; when it
is set, all worker processes are terminated as soon as one of them
reports a failure.
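
A minimal sketch of how a caller could opt in to the new behaviour (the
JSON file name and the duration value below are illustrative assumptions,
not part of this change):

    import json

    from tempest.stress import driver

    # Load a stress test description (a list of action dicts) and run the
    # driver; with stop_on_error=True all worker processes are terminated
    # as soon as one of them records a failure.
    with open('stress-tests.json') as f:
        tests = json.load(f)
    driver.stress_openstack(tests, duration=300, stop_on_error=True)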

Change-Id: Iaab9b508cb243a69d70d3101c01ae53c01612d2c
diff --git a/tempest/stress/driver.py b/tempest/stress/driver.py
index c4c2041..d9b95e0 100644
--- a/tempest/stress/driver.py
+++ b/tempest/stress/driver.py
@@ -14,6 +14,7 @@
 
 import logging
 import multiprocessing
+import signal
 import time
 
 from tempest import clients
@@ -45,6 +46,7 @@
 # add the handler to the root logger
 logger = logging.getLogger('tempest.stress')
 logger.addHandler(_console)
+processes = []
 
 
 def do_ssh(command, host):
@@ -93,10 +95,29 @@
     return None
 
 
-def stress_openstack(tests, duration, max_runs=None):
+def sigchld_handler(signum, frame):
+    """
+    Signal handler (only active if stop_on_error is True).
+    """
+    terminate_all_processes()
+
+
+def terminate_all_processes():
+    """
+    Goes through the process list and terminates all child processes.
+    """
+    for process in processes:
+        if process['process'].is_alive():
+            try:
+                process['process'].terminate()
+            except Exception:
+                pass
+        process['process'].join()
+
+
+def stress_openstack(tests, duration, max_runs=None, stop_on_error=False):
     """
     Workload driver. Executes an action function against a nova-cluster.
-
     """
     logfiles = admin_manager.config.stress.target_logfiles
     log_check_interval = int(admin_manager.config.stress.log_check_interval)
@@ -105,7 +126,6 @@
         computes = _get_compute_nodes(controller)
         for node in computes:
             do_ssh("rm -f %s" % logfiles, node)
-    processes = []
     for test in tests:
         if test.get('use_admin', False):
             manager = admin_manager
@@ -127,7 +147,7 @@
                                           tenant_name=tenant_name)
 
             test_obj = importutils.import_class(test['action'])
-            test_run = test_obj(manager, logger, max_runs)
+            test_run = test_obj(manager, logger, max_runs, stop_on_error)
 
             kwargs = test.get('kwargs', {})
             test_run.setUp(**dict(kwargs.iteritems()))
@@ -150,6 +170,9 @@
 
             processes.append(process)
             p.start()
+    if stop_on_error:
+        # NOTE(mkoderer): only the parent should register the handler
+        signal.signal(signal.SIGCHLD, sigchld_handler)
     end_time = time.time() + duration
     had_errors = False
     while True:
@@ -168,6 +191,11 @@
                 break
 
         time.sleep(min(remaining, log_check_interval))
+        if stop_on_error:
+            # stop the main loop as soon as any worker reports a failure
+            if any(proc['statistic']['fails'] > 0 for proc in processes):
+                break
+
         if not logfiles:
             continue
         errors = _error_in_logs(logfiles, computes)
@@ -175,10 +203,7 @@
             had_errors = True
             break
 
-    for process in processes:
-        if process['process'].is_alive():
-            process['process'].terminate()
-        process['process'].join()
+    terminate_all_processes()
 
     sum_fails = 0
     sum_runs = 0