Add integration test to detect excessive duplicate 'when' conditions in tasks files

This test scans all .yml/.yaml files under any tasks/ directory and flags cases where the same 'when' condition appears on more than 3 tasks in the same file. Excessive duplication of identical conditions can harm Ansible performance because the condition is re-evaluated for every task. The test suggests replacing repeated conditions with an include_tasks call or a block guarded by the condition to evaluate it only once. https://chatgpt.com/share/6899c605-6f40-800f-a954-ccb62f8bbcf1
2025-12-14 13:05:25 +00:00 · 2025-08-11 12:29:57 +02:00
parent 6e04ac58d2
commit 6318611931
1 changed files with 202 additions and 0 deletions
--- a/tests/integration/test_when_condition_duplication.py
+++ b/tests/integration/test_when_condition_duplication.py
@@ -0,0 +1,202 @@
+# tests/integration/test_when_condition_duplication.py
+import os
+import unittest
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Tuple
+
+try:
+    import yaml  # PyYAML
+except ImportError as e:
+    raise SystemExit(
+        "PyYAML is required to run this test. Install with: pip install pyyaml"
+    ) from e
+
+
+THRESHOLD = 3  # fail if the same when-condition occurs on more than this many tasks
+
+
+def _find_repo_root_containing(marker_names: Iterable[str], max_up: int = 8) -> Path:
+    """
+    Walk upwards from this file to find the repo root. We assume the project root
+    contains at least one of `marker_names` (e.g., 'roles', '.git', 'playbooks').
+    """
+    here = Path(__file__).resolve().parent
+    cur = here
+    for _ in range(max_up):
+        for marker in marker_names:
+            if (cur / marker).exists():
+                return cur
+        if cur.parent == cur:
+            break
+        cur = cur.parent
+    # Fallback: repo root assumed 3 levels up from tests/integration
+    return Path(__file__).resolve().parents[3]
+
+
+def _normalize_when(value: Any) -> str:
+    """
+    Normalize Ansible 'when' to a comparable string:
+    - If it's a list, join with ' && ' preserving order (order matters in Ansible).
+    - If it's a scalar, strip leading/trailing whitespace.
+    - Represent everything as a single-line string for stable comparison.
+    """
+    if value is None:
+        return ""
+    if isinstance(value, list):
+        parts = []
+        for v in value:
+            s = "" if v is None else str(v).strip()
+            # collapse internal whitespace runs to a single space for stability
+            s = " ".join(s.split())
+            parts.append(s)
+        return " && ".join(parts)
+    # scalar (str, int, bool, jinja template, etc.)
+    s = str(value).strip()
+    return " ".join(s.split())
+
+
+def _iter_tasks(node: Any) -> Iterable[Dict[str, Any]]:
+    """
+    Yield task-like dicts (those which may contain 'when') from arbitrary YAML structures.
+    Handles:
+      - Top-level lists of tasks
+      - Dicts that contain keys like 'block', 'rescue', 'always' (Ansible blocks)
+      - Nested lists/dicts recursively
+    We only yield a dict once as a "task" (the one that has a 'when' or looks like a task).
+    """
+    if isinstance(node, list):
+        for item in node:
+            yield from _iter_tasks(item)
+    elif isinstance(node, dict):
+        # If this dict itself looks like a task (has module keys or 'when'/'name'),
+        # yield it, but also traverse nested blocks.
+        is_task_like = any(
+            k in node
+            for k in (
+                "when",
+                "name",
+                "block",
+                "rescue",
+                "always",
+                "include_tasks",
+                "import_tasks",
+                "ansible.builtin.include_tasks",
+                "ansible.builtin.import_tasks",
+            )
+        )
+        if is_task_like:
+            yield node
+
+        # Recurse into Ansible block sections if present
+        for section in ("block", "rescue", "always"):
+            if section in node and isinstance(node[section], list):
+                for item in node[section]:
+                    yield from _iter_tasks(item)
+        # Also traverse other nested structures conservatively
+        for k, v in node.items():
+            if k not in ("block", "rescue", "always"):
+                if isinstance(v, (list, dict)):
+                    yield from _iter_tasks(v)
+
+
+def _load_yaml_documents(path: Path) -> List[Any]:
+    """
+    Load all YAML documents from a file. Best-effort parsing:
+    - If YAML fails due to Jinja syntax, we still raise, because a broken file
+      should be fixed in the repo.
+    """
+    text = path.read_text(encoding="utf-8")
+    return list(yaml.safe_load_all(text))  # may return [None] if empty
+
+
+def _collect_when_counts(yaml_docs: List[Any]) -> Dict[str, List[Tuple[str, str]]]:
+    """
+    Return a mapping: normalized_when -> list of (task_name, hint_location)
+    where each entry corresponds to a task that uses that 'when'.
+    """
+    counts: Dict[str, List[Tuple[str, str]]] = {}
+    for doc in yaml_docs:
+        for task in _iter_tasks(doc):
+            if "when" not in task:
+                continue
+            normalized = _normalize_when(task.get("when"))
+            if not normalized:
+                continue
+            task_name = str(task.get("name") or "<unnamed task>")
+            # Provide a minimal hint for where this came from (e.g., module/inclusion used)
+            hint = None
+            for key in (
+                "include_tasks",
+                "import_tasks",
+                "ansible.builtin.include_tasks",
+                "ansible.builtin.import_tasks",
+            ):
+                if key in task:
+                    hint = f"{key}: {task[key]}"
+                    break
+            hint_loc = hint or "task"
+            counts.setdefault(normalized, []).append((task_name, hint_loc))
+    return counts
+
+
+class WhenConditionDuplicationTest(unittest.TestCase):
+    """
+    Integration test that ensures we don't repeat the same 'when' condition
+    on too many tasks in a single tasks file.
+
+    Rationale:
+      Repeating identical 'when' across many tasks forces Ansible to evaluate
+      the same condition over and over, which is bad for performance. Prefer
+      factoring those tasks into a dedicated file and use `include_tasks`
+      (or a block with a single 'when') to evaluate once.
+    """
+
+    def test_excessive_repeated_when_in_tasks_files(self):
+        repo_root = _find_repo_root_containing(marker_names=(".git", "roles", "playbooks"))
+        tasks_globs = [
+            "**/tasks/**/*.yml",
+            "**/tasks/**/*.yaml",
+        ]
+
+        violations: List[str] = []
+
+        for pattern in tasks_globs:
+            for path in repo_root.glob(pattern):
+                # Only scan files that are inside the project workspace
+                if not path.is_file():
+                    continue
+
+                try:
+                    docs = _load_yaml_documents(path)
+                except Exception as exc:
+                    self.fail(f"Failed to parse YAML file: {path}\n{exc}")
+
+                counts = _collect_when_counts(docs)
+                for normalized_when, occurrences in counts.items():
+                    if len(occurrences) > THRESHOLD:
+                        # Build a helpful error message showing a few sample tasks with this condition
+                        sample = "\n".join(
+                            f"    - {tname} ({hint})"
+                            for tname, hint in occurrences[:5]
+                        )
+                        violations.append(
+                            (
+                                f"{path} uses the same 'when' condition more than {THRESHOLD} times "
+                                f"({len(occurrences)} occurrences):\n"
+                                f"  WHEN: {normalized_when}\n"
+                                f"  Sample tasks:\n{sample}\n"
+                                f"Suggestion: Group these tasks into a separate file and call it with "
+                                f"`include_tasks`, or use a single `block` guarded by this 'when' to avoid "
+                                f"re-evaluating the condition repeatedly."
+                            )
+                        )
+
+        if violations:
+            self.fail(
+                "Excessive duplicate 'when' conditions detected (hurts performance):\n\n"
+                + "\n\n".join(violations)
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()