[BE] Fix bug in flaky test uploading

I noticed that the latest Windows test_c10d_gloo flakiness (238d01ec90) did not get caught by the flaky test bot. The issue was that test_c10d_gloo is a test file that intentionally spawns duplicate tests, and I had excluded those from my "duplicate flaky test rerun" calculation.

I've now updated the logic to detect flakiness in those intentionally duplicated tests as well.

This PR also cleans up some outdated logic left over from earlier changes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/74712
Approved by: https://github.com/malfet
commit b0d7cd0111 (parent 9233af181f)
Authored by Jane Xu on 2022-03-30 19:20:44 +00:00; committed by PyTorch MergeBot


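As a rough sketch (not the actual upload tooling, just the condition the last hunk below lands on): a duplicated test is reported as flaky only when its reruns disagree, i.e. at least one run passed and at least one failed.

# Simplified sketch of the reporting rule: count green (passing) and red
# (failing) runs among a test's duplicates and flag it flaky only when both
# appear. All-green intentional duplicates and all-red real failures are not
# flagged.
from typing import List

def is_flaky(run_outcomes: List[bool]) -> bool:
    # run_outcomes: True for a passing run, False for a failing one
    num_green = sum(run_outcomes)
    num_red = len(run_outcomes) - num_green
    return num_green > 0 and num_red > 0

assert is_flaky([True, False])       # reruns disagree -> flaky
assert not is_flaky([True, True])    # intentional duplicates that all pass -> not flaky
assert not is_flaky([False, False])  # consistently failing -> a real failure, not flaky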
@@ -576,6 +576,10 @@ class TestCase:
         self.class_name = str(dom.attributes['classname'].value)
         self.name = str(dom.attributes['name'].value)
         self.time = float(dom.attributes['time'].value)
+        # The following attribute is currently ONLY used in process_intentional_test_runs for validation
+        # reasons. The test filename that populates TestFile is calculated and passed down through the test report path.
+        # The reason we don't just use this attribute is because it doesn't exist for cpp tests, e.g., in test_libtorch
+        self.file = str(dom.attributes['file'].value) if dom.hasAttribute('file') else 'N/A - probably a cpp test'
         error_elements = dom.getElementsByTagName('error')
         # DISCLAIMER: unexpected successes and expected failures are currently not reported in assemble_s3_object
         self.expected_failure = False
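For context on the new 'file' attribute, here is a minimal standalone sketch of the same fallback using xml.dom.minidom and a made-up junit-style element (the real parsing happens in the TestCase constructor above):

# Hypothetical report snippet; cpp test reports have no 'file' attribute and
# therefore get the placeholder value instead.
from xml.dom import minidom

report = minidom.parseString(
    '<testcase classname="SomeTestClass" name="test_example" '
    'time="0.01" file="distributed/test_c10d_gloo.py"/>'
)
case = report.getElementsByTagName('testcase')[0]
file_attr = (str(case.attributes['file'].value)
             if case.hasAttribute('file')
             else 'N/A - probably a cpp test')
print(file_attr)  # distributed/test_c10d_gloo.py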
@@ -601,9 +605,9 @@ class TestCase:
         return self.__str__()

     def __str__(self) -> str:
-        return f'[TestCase name: {self.name} | class_name: {self.class_name} | time: {self.time} | ' \
+        return f'[TestCase name: {self.name} | class_name: {self.class_name} | file: {self.file} | time: {self.time} | ' \
             f'expected_failure: {self.expected_failure} | skipped: {self.skipped} | errored: {self.errored} | ' \
-            f'unexpected_success: {self.unexpected_success} | failed: {self.failed}]'
+            f'unexpected_success: {self.unexpected_success} | failed: {self.failed}]\n'

 class TestSuite:
     def __init__(self, name: str) -> None:
@@ -644,6 +648,17 @@ class TestSuite:
         self.test_cases[name].expected_failure |= test_case.expected_failure

+# Tests that spawn duplicates (usually only twice) intentionally
+MULTITESTS = [
+    'test_cpp_extensions_aot',
+    'distributed/test_distributed_spawn',
+    'distributed\\test_distributed_spawn',  # for windows
+    'distributed/test_c10d_gloo',
+    'distributed\\test_c10d_gloo',  # for windows
+    'cpp'  # The caffe2 cpp tests spawn duplicate test cases as well.
+]
+
 DuplicatedDict = Dict[str, Dict[str, List[TestCase]]]

 class TestFile:
@@ -653,27 +668,20 @@ class TestFile:
         self.test_suites: Dict[str, TestSuite] = dict()

     def append(self, test_case: TestCase, test_type: str, duplicated_tests_dict: DuplicatedDict) -> None:
-        is_multi_test = self.name == 'test_cpp_extensions_aot' or \
-            self.name == 'distributed/test_distributed_spawn' or \
-            self.name == 'distributed/test_c10d_gloo' or \
-            self.name == 'cpp'  # The caffe2 cpp tests spawn duplicate test cases as well.
-        if is_multi_test:
-            suite_name = test_case.class_name + '__' + test_type
-        else:
-            suite_name = test_case.class_name
+        suite_name = test_case.class_name
         if suite_name not in self.test_suites:
             self.test_suites[suite_name] = TestSuite(suite_name)
         if test_case.name in self.test_suites[suite_name].test_cases:
-            if is_multi_test:
+            if self.name in MULTITESTS:
                 self.test_suites[suite_name].update(test_case)
                 self.total_time += test_case.time
-            else:
-                # Gather up duplicated test cases
-                if suite_name not in duplicated_tests_dict:
-                    duplicated_tests_dict[suite_name] = dict()
-                if test_case.name not in duplicated_tests_dict[suite_name]:
-                    duplicated_tests_dict[suite_name][test_case.name] = [self.test_suites[suite_name].test_cases[test_case.name]]
-                duplicated_tests_dict[suite_name][test_case.name].append(test_case)
+            # Gather up duplicated test cases to parse for flaky reruns
+            if suite_name not in duplicated_tests_dict:
+                duplicated_tests_dict[suite_name] = dict()
+            if test_case.name not in duplicated_tests_dict[suite_name]:
+                duplicated_tests_dict[suite_name][test_case.name] = [self.test_suites[suite_name].test_cases[test_case.name]]
+            duplicated_tests_dict[suite_name][test_case.name].append(test_case)
         else:
             self.test_suites[suite_name].append(test_case)
             self.total_time += test_case.time
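As a rough illustration of the bookkeeping above (Run is a hypothetical stand-in for the real TestCase; DuplicatedDict maps suite name to test name to the list of recorded runs):

# Simplified sketch of how duplicate runs are gathered for later flaky-rerun
# analysis. The first duplicate seeds the list with the run already stored in
# the suite; every further duplicate is appended.
from typing import Dict, List, NamedTuple

class Run(NamedTuple):  # stand-in for the real TestCase parsed from the XML report
    name: str
    failed: bool

DuplicatedRuns = Dict[str, Dict[str, List[Run]]]

def gather_duplicate(dup: DuplicatedRuns, suite: str, first_seen: Run, rerun: Run) -> None:
    dup.setdefault(suite, {}).setdefault(rerun.name, [first_seen]).append(rerun)

dups: DuplicatedRuns = {}
gather_duplicate(dups, 'TestDistBackendWithSpawn',
                 Run('test_broadcast', failed=True), Run('test_broadcast', failed=False))
# dups now holds both the original failing run and the passing rerun under
# dups['TestDistBackendWithSpawn']['test_broadcast']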
@@ -743,17 +751,9 @@ def process_intentional_test_runs(runs: List[TestCase]) -> Tuple[int, int]:
         else:
             num_pass += 1
-    REPEAT_TEST_FOR_TYPES_TESTS = [
-        "test_data_parallel_module",
-        "test_data_parallel_module_kwargs_only",
-        "test_data_parallel_module_kwargs_only_empty_list",
-        "test_data_parallel_module_kwargs_only_empty_dict",
-        "test_data_parallel_module_kwargs_only_empty_tuple"
-    ]
-    # Do not run checks for tests that use repeat_test_for_types decorator as they do not go well with our retry
-    # functionality. Once issue https://github.com/pytorch/pytorch/issues/69865 is fixed, we should remove the exception
-    if not any([x in test_run.name for x in REPEAT_TEST_FOR_TYPES_TESTS]):
+    # Do not run duplication checks for test files that spawn duplicate tests intentionally
+    # and are not necessarily flaky test reruns.
+    if not any(x in test_run.file for x in MULTITESTS):
         err_msg = f'Warning: unintentional test case duplicates found for {test_run.name} in suite {test_run.class_name}.'
         report_only = os.getenv('PYTORCH_OVERRIDE_FLAKY_SIGNAL') != '1'
         if report_only and num_fail + num_errored + num_unexpected_success < 1 or not report_only and num_expected_fail < 1:
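To make the new guard concrete, here is a small standalone sketch of the substring check on a report path (the MULTITESTS entries mirror the hunk above; the example paths are made up):

MULTITESTS = [
    'test_cpp_extensions_aot',
    'distributed/test_distributed_spawn',
    'distributed\\test_distributed_spawn',  # for windows
    'distributed/test_c10d_gloo',
    'distributed\\test_c10d_gloo',  # for windows
    'cpp',  # cpp reports carry the 'N/A - probably a cpp test' placeholder, which this also matches
]

def spawns_intentional_duplicates(test_file: str) -> bool:
    # Same substring test as in the diff: both '/' and '\\' variants are
    # listed so that Windows report paths are recognized too.
    return any(x in test_file for x in MULTITESTS)

assert spawns_intentional_duplicates('distributed\\test_c10d_gloo')  # windows path now matches
assert spawns_intentional_duplicates('N/A - probably a cpp test')    # cpp fallback matches via 'cpp'
assert not spawns_intentional_duplicates('test_nn')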
@@ -780,7 +780,7 @@ def assemble_flaky_test_stats(duplicated_tests_by_file: Dict[str, DuplicatedDict
         for suite_name, testcase_to_runs in suite_to_dict.items():
             for testcase_name, list_of_runs in testcase_to_runs.items():
                 num_green, num_red = process_intentional_test_runs(list_of_runs)
-                if num_green > 0:  # Otherwise, it's likely just a failing test
+                if num_green > 0 and num_red > 0:  # Flaky tests show different results in consecutive reruns
                     flaky_tests.append({
                         "name": testcase_name,
                         "suite": suite_name,