Add retrying of failed tests and an optional output prefix to sharding_supervisor

Complements the changes just made to buildbot. BUG=91709, 93091. Review URL: http://codereview.chromium.org/7670002. git-svn-id: svn://svn.chromium.org/chrome/trunk/src@97275 0039d316-1c4b-4281-b951-d872f2087c98
author: charleslee@chromium.org <charleslee@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2011-08-18 03:30:25 +0000
committer: charleslee@chromium.org <charleslee@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2011-08-18 03:30:25 +0000
commit: f25b8f41b1dcc4581ce0d46e1d45515d5c9f59ec (patch)
tree: 76a3b7809c8dd98feeae4495c1462d8877abd0ef /tools
parent: d0cf438c4d63c5127a35d76316a1b28e1e788376 (diff)
download: chromium_src-f25b8f41b1dcc4581ce0d46e1d45515d5c9f59ec.zip
chromium_src-f25b8f41b1dcc4581ce0d46e1d45515d5c9f59ec.tar.gz
chromium_src-f25b8f41b1dcc4581ce0d46e1d45515d5c9f59ec.tar.bz2
1 files changed, 130 insertions, 50 deletions
diff --git a/tools/sharding_supervisor/sharding_supervisor.py b/tools/sharding_supervisor/sharding_supervisor.py
index 114b9fd..21bbab5 100755
--- a/tools/sharding_supervisor/sharding_supervisor.py
+++ b/tools/sharding_supervisor/sharding_supervisor.py
@@ -14,21 +14,33 @@ is started for that shard and the output is identical to gtest's output.
 """
 
 
-from cStringIO import StringIO
+import cStringIO
+import itertools
 import optparse
 import os
 import Queue
 import random
 import re
-import subprocess
 import sys
 import threading
 
+# Add tools/ to path
+BASE_PATH = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(os.path.join(BASE_PATH, ".."))
+try:
+  import find_depot_tools
+  # Fixes a bug in Windows where some shards die upon starting
+  import subprocess2 as subprocess
+except ImportError:
+  # Unable to find depot_tools, so just use standard subprocess
+  import subprocess
+
 
 SS_USAGE = "python %prog [options] path/to/test [gtest_args]"
 SS_DEFAULT_NUM_CORES = 4
 SS_DEFAULT_SHARDS_PER_CORE = 5 # num_shards = cores * SHARDS_PER_CORE
 SS_DEFAULT_RUNS_PER_CORE = 1 # num_workers = cores * RUNS_PER_CORE
+SS_DEFAULT_RETRY_PERCENT = 5 # --retry-failed ignored if more than 5% fail
 
 
 def DetectNumCores():
@@ -91,16 +103,17 @@ class ShardRunner(threading.Thread):
     self.test_fail = test_fail
     self.current_test = ""
 
-  def ReportFailure(self, description, prefix, test_name):
-    log_line = "%s: %s%s\n" % (description, prefix, test_name)
-    self.supervisor.LogLineFailure(log_line)
+  def ReportFailure(self, description, test_name):
+    log_line = "%s: %s\n" % (description, test_name)
+    self.supervisor.LogTestFailure(log_line)
 
-  def ProcessLine(self, prefix, line):
+  def ProcessLine(self, line):
     results = self.test_start.search(line)
     if results:
       if self.current_test:
-        self.ReportFailure("INCOMPLETE", prefix, self.current_test)
+        self.ReportFailure("INCOMPLETE", self.current_test)
       self.current_test = results.group(1)
+      self.supervisor.IncrementTestCount()
       return
 
     results = self.test_ok.search(line)
@@ -110,7 +123,7 @@ class ShardRunner(threading.Thread):
 
     results = self.test_fail.search(line)
     if results:
-      self.ReportFailure("FAILED", prefix, results.group(1))
+      self.ReportFailure("FAILED", results.group(1))
       self.current_test = ""
 
   def run(self):
@@ -125,8 +138,7 @@ class ShardRunner(threading.Thread):
         index = self.counter.get_nowait()
       except Queue.Empty:
         break
-      prefix = "%i>" % index
-      chars = StringIO()
+      chars = cStringIO.StringIO()
       shard_running = True
       shard = RunShard(
           self.supervisor.test, self.supervisor.num_shards, index,
@@ -140,11 +152,12 @@ class ShardRunner(threading.Thread):
           line = chars.getvalue()
           if not line and not shard_running:
             break
-          self.ProcessLine(prefix, line)
-          line = prefix + line
+          self.ProcessLine(line)
           self.supervisor.LogOutputLine(index, line)
           chars.close()
-          chars = StringIO()
+          chars = cStringIO.StringIO()
+      if self.current_test:
+        self.ReportFailure("INCOMPLETE", self.current_test)
       self.supervisor.ShardIndexCompleted(index)
       if shard.returncode != 0:
         self.supervisor.LogShardFailure(index)
@@ -159,25 +172,28 @@ class ShardingSupervisor(object):
     num_runs: Total number of worker threads to create for running shards.
     color: Indicates which coloring mode to use in the output.
     gtest_args: The options to pass to gtest.
-    failure_log: List of statements from shard output indicating a failure.
+    failed_tests: List of statements from shard output indicating a failure.
     failed_shards: List of shards that contained failing tests.
   """
 
   SHARD_COMPLETED = object()
 
-  def __init__(
-      self, test, num_shards, num_runs, color, reorder, gtest_args):
+  def __init__(self, test, num_shards, num_runs, color, original_order,
+               prefix, retry_percent, gtest_args):
     """Inits ShardingSupervisor with given options and gtest arguments."""
     self.test = test
     self.num_shards = num_shards
     self.num_runs = num_runs
     self.color = color
-    self.reorder = reorder
+    self.original_order = original_order
+    self.prefix = prefix
+    self.retry_percent = retry_percent
     self.gtest_args = gtest_args
-    self.failure_log = []
+    self.failed_tests = []
     self.failed_shards = []
     self.shards_completed = [False] * num_shards
     self.shard_output = [Queue.Queue() for _ in range(num_shards)]
+    self.test_counter = itertools.count()
 
   def ShardTest(self):
     """Runs the test and manages the worker threads.
@@ -185,10 +201,10 @@ class ShardingSupervisor(object):
     Runs the test and outputs a summary at the end. All the tests in the
     suite are run by creating (cores * runs_per_core) threads and
     (cores * shards_per_core) shards. When all the worker threads have
-    finished, the lines saved in the failure_log are printed again.
+    finished, the lines saved in failed_tests are printed again.
 
     Returns:
-      The number of shards that had failing tests.
+      1 if some unexpected (not FLAKY or FAILS) tests failed, 0 otherwise.
     """
 
     # Regular expressions for parsing GTest logs. Test names look like
@@ -218,22 +234,38 @@ class ShardingSupervisor(object):
           self, counter, test_start, test_ok, test_fail)
       worker.start()
       workers.append(worker)
-    if self.reorder:
-      self.WaitForShards()
-    else:
+    if self.original_order:
       for worker in workers:
         worker.join()
+    else:
+      self.WaitForShards()
 
-    return self.PrintSummary(self.failure_log)
-
-  def LogLineFailure(self, line):
+    num_failed = len(self.failed_shards)
+    if num_failed > 0:
+      self.failed_shards.sort()
+      self.WriteText(sys.stderr,
+                     "\nFAILED SHARDS: %s\n" % str(self.failed_shards),
+                     "\x1b[1;5;31m")
+    else:
+      self.WriteText(sys.stderr, "\nALL SHARDS PASSED!\n", "\x1b[1;5;32m")
+    self.PrintSummary(self.failed_tests)
+
+    self.failed_tests = [x for x in self.failed_tests if x.find("FAILS_") < 0]
+    self.failed_tests = [x for x in self.failed_tests if x.find("FLAKY_") < 0]
+    if not self.failed_tests:
+      return 0
+    if self.retry_percent < 0:
+      return len(self.failed_shards) > 0
+    return self.RetryFailedTests()
+
+  def LogTestFailure(self, line):
     """Saves a line in the failure log to be printed at the end.
 
     Args:
-      line: The line to save in the failure_log.
+      line: The line to save in the failed_tests list.
     """
-    if line not in self.failure_log:
-      self.failure_log.append(line)
+    if line not in self.failed_tests:
+      self.failed_tests.append(line)
 
   def LogShardFailure(self, index):
     """Records that a test in the given shard has failed.
@@ -252,39 +284,54 @@ class ShardingSupervisor(object):
         sys.stdout.write(line)
 
   def LogOutputLine(self, index, line):
-    if self.reorder:
-      self.shard_output[index].put(line)
-    else:
+    if self.prefix:
+      line = "%i>%s" % (index, line)
+    if self.original_order:
       sys.stdout.write(line)
+    else:
+      self.shard_output[index].put(line)
+
+  def IncrementTestCount(self):
+    self.test_counter.next()
 
   def ShardIndexCompleted(self, index):
     self.shard_output[index].put(self.SHARD_COMPLETED)
 
+  def RetryFailedTests(self):
+    num_tests_run = self.test_counter.next()
+    if len(self.failed_tests) * 100 > self.retry_percent * num_tests_run:
+      sys.stderr.write("\nNOT RETRYING FAILED TESTS (too many failed)\n")
+      return 1
+    self.WriteText(sys.stderr, "\nRETRYING FAILED TESTS:\n", "\x1b[1;5;33m")
+    sharded_description = re.compile(r": (?:\d+>)?(.*)")
+    gtest_filters = [sharded_description.search(line).group(1)
+                     for line in self.failed_tests]
+    failed_retries = []
+
+    for test_filter in gtest_filters:
+      args = [self.test, "--gtest_filter=" + test_filter]
+      args.extend(self.gtest_args)
+      rerun = subprocess.Popen(args)
+      rerun.wait()
+      if rerun.returncode != 0:
+        failed_retries.append(test_filter)
+
+    self.WriteText(sys.stderr, "RETRY RESULTS:\n", "\x1b[1;5;33m")
+    self.PrintSummary(failed_retries)
+    return len(failed_retries) > 0
+
   def PrintSummary(self, failed_tests):
     """Prints a summary of the test results.
 
     If any shards had failing tests, the list is sorted and printed. Then all
     the lines that indicate a test failure are reproduced.
-
-    Returns:
-      The number of shards that had failing tests.
     """
-    sys.stderr.write("\n")
-    num_failed = len(self.failed_shards)
-    if num_failed > 0:
-      self.failed_shards.sort()
-      self.WriteText(sys.stderr,
-                     "FAILED SHARDS: %s\n" % str(self.failed_shards),
-                     "\x1b[1;5;31m")
-    else:
-      self.WriteText(sys.stderr, "ALL SHARDS PASSED!\n", "\x1b[1;5;32m")
     if failed_tests:
       self.WriteText(sys.stderr, "FAILED TESTS:\n", "\x1b[1;5;31m")
       for line in failed_tests:
         sys.stderr.write(line)
-    if self.color:
-      sys.stderr.write("\x1b[m")
-    return num_failed
+    else:
+      self.WriteText(sys.stderr, "ALL TESTS PASSED!\n", "\x1b[1;5;32m")
 
   def WriteText(self, pipe, text, ansi):
     if self.color:
@@ -316,8 +363,26 @@ def main():
       "--reorder", action="store_true",
       help="ensure that all output from an earlier shard is printed before"
       " output from a later shard")
-  parser.add_option("--random-seed", action="store_true",
+  # TODO(charleslee): for backwards compatibility with master.cfg file
+  parser.add_option(
+      "--original-order", action="store_true",
+      help="print shard output in its original jumbled order of execution"
+      " (useful for debugging flaky tests)")
+  parser.add_option(
+      "--prefix", action="store_true",
+      help="prefix each line of shard output with 'N>', where N is the shard"
+      " index (forced True when --original-order is True)")
+  parser.add_option(
+      "--random-seed", action="store_true",
       help="shuffle the tests with a random seed value")
+  parser.add_option(
+      "--retry-failed", action="store_true",
+      help="retry tests that did not pass serially")
+  parser.add_option(
+      "--retry-percent", type="int",
+      default=SS_DEFAULT_RETRY_PERCENT,
+      help="ignore --retry-failed if more than this percent fail [0, 100]"
+      " (default = %i)" % SS_DEFAULT_RETRY_PERCENT)
   parser.disable_interspersed_args()
   (options, args) = parser.parse_args()
 
@@ -339,10 +404,24 @@ def main():
   gtest_args = ["--gtest_color=%s" % {
       True: "yes", False: "no"}[options.color]] + args[1:]
 
+  if options.original_order:
+    options.prefix = True
+
+  # TODO(charleslee): for backwards compatibility with buildbot's log_parser
+  if options.reorder:
+    options.original_order = False
+    options.prefix = True
+
   if options.random_seed:
     seed = random.randint(1, 99999)
     gtest_args.extend(["--gtest_shuffle", "--gtest_random_seed=%i" % seed])
 
+  if options.retry_failed:
+    if options.retry_percent < 0 or options.retry_percent > 100:
+      parser.error("Retry percent must be an integer [0, 100]!")
+  else:
+    options.retry_percent = -1
+
   if options.runshard != None:
     # run a single shard and exit
     if (options.runshard < 0 or options.runshard >= num_shards):
@@ -353,8 +432,9 @@ def main():
     return shard.poll()
 
   # shard and run the whole test
-  ss = ShardingSupervisor(args[0], num_shards, num_runs, options.color,
-                          options.reorder, gtest_args)
+  ss = ShardingSupervisor(
+      args[0], num_shards, num_runs, options.color, options.original_order,
+      options.prefix, options.retry_percent, gtest_args)
   return ss.ShardTest()
author	charleslee@chromium.org <charleslee@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2011-08-18 03:30:25 +0000
committer	charleslee@chromium.org <charleslee@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2011-08-18 03:30:25 +0000
commit	f25b8f41b1dcc4581ce0d46e1d45515d5c9f59ec (patch)
tree	76a3b7809c8dd98feeae4495c1462d8877abd0ef /tools
parent	d0cf438c4d63c5127a35d76316a1b28e1e788376 (diff)
download	chromium_src-f25b8f41b1dcc4581ce0d46e1d45515d5c9f59ec.zip chromium_src-f25b8f41b1dcc4581ce0d46e1d45515d5c9f59ec.tar.gz chromium_src-f25b8f41b1dcc4581ce0d46e1d45515d5c9f59ec.tar.bz2