#!/usr/bin/env python3
"""
Restart Docker-Compose configurations with exited or unhealthy containers.

This version receives the *manipulation services* via argparse (no Jinja).
"""
import argparse
import os
import subprocess
import time
from typing import List, Optional

# Go-template passed to ``docker ps --format``.  The script is a plain Python
# file (neither an f-string nor a Jinja template), so the braces must not be
# doubled again.
DOCKER_NAMES_FORMAT = "{{.Names}}"

# Process exit statuses are truncated to 8 bits; clamping keeps an error count
# of 256 from being reported as success.
MAX_EXIT_CODE = 255


def bash(command: str) -> List[str]:
    """
    Run *command* through the shell and return its stdout as a list of lines.

    The command is echoed before execution.  Output is decoded as UTF-8 with
    undecodable bytes replaced.

    Raises:
        RuntimeError: if the command exits with any non-zero status, including
            the negative codes reported when the process is killed by a
            signal.  The message is the command's stderr, or a generic text
            when stderr is empty.
    """
    print(command)
    with subprocess.Popen(
        command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
    ) as process:
        out, err = process.communicate()
    stderr = err.decode("utf-8", errors="replace").strip()
    output = [line.decode("utf-8", errors="replace") for line in out.splitlines()]
    if process.returncode != 0:
        print(command, out, err)
        raise RuntimeError(stderr or f"Command failed with code {process.returncode}")
    return output


def list_to_string(lst: List[str]) -> str:
    """Join the items of *lst* with single spaces."""
    return " ".join(lst)


def print_bash(command: str) -> List[str]:
    """Run *command* via :func:`bash`, print non-empty output, return the lines."""
    output = bash(command)
    if output:
        print(list_to_string(output))
    return output


def find_docker_compose_file(directory: str) -> Optional[str]:
    """
    Return the path of the first ``docker-compose.yml`` found below *directory*.

    Returns ``None`` when no such file exists (or *directory* is missing).
    """
    for root, _, files in os.walk(directory):
        if "docker-compose.yml" in files:
            return os.path.join(root, "docker-compose.yml")
    return None


def normalize_services_arg(raw: Optional[List[str]], raw_str: Optional[str]) -> List[str]:
    """
    Build the list of manipulation services from the CLI arguments.

    Accept either:
      - values collected from ``--manipulation SERVICE ...`` flags
      - a single ``--manipulation-string "svc1 svc2 ..."`` (space or comma separated)

    *raw* takes precedence over *raw_str*.  Names are stripped of surrounding
    whitespace and blank entries are dropped.
    """
    if raw:
        return [name.strip() for name in raw if name.strip()]
    if raw_str:
        # split on comma or whitespace
        return [part for chunk in raw_str.split(",") for part in chunk.split()]
    return []


def wait_while_manipulation_running(
    services: List[str],
    waiting_time: int = 600,
    timeout: Optional[int] = None,
) -> None:
    """
    Wait until none of the given systemd services are active anymore.

    Args:
        services: unit names checked with ``systemctl is-active``.
        waiting_time: seconds to sleep between two checks.
        timeout: maximum total seconds to wait; ``None`` or ``0`` waits
            without limit.  A single sleep never extends past the timeout, so
            a timeout shorter than *waiting_time* is honoured.
    """
    if not services:
        print("No manipulation services provided. Continuing without wait.")
        return

    start = time.time()
    while True:
        # any() short-circuits on the first active service.
        any_active = any(
            subprocess.run(f"systemctl is-active --quiet {svc}", shell=True).returncode == 0
            for svc in services
        )
        if not any_active:
            print("No blocking service is running.")
            return

        delay = waiting_time
        if timeout:
            remaining = timeout - (time.time() - start)
            if remaining <= 0:
                print(f"Timeout ({timeout}s) reached while waiting for services. Continuing anyway.")
                return
            delay = min(waiting_time, remaining)
        print(f"Manipulation service is running. Trying again in {delay:.0f} seconds.")
        time.sleep(delay)


def docker_ps_names_command(filter_expression: str) -> str:
    """Return the ``docker ps`` command listing container names for a filter."""
    return f"docker ps --filter {filter_expression} --format '{DOCKER_NAMES_FORMAT}'"


def main(base_directory: str, manipulation_services: List[str], timeout: Optional[int]) -> int:
    """
    Restart every compose project that owns an unhealthy or exited container.

    The project name is taken from the container name up to the first ``-``
    and is expected to be a directory below *base_directory*.

    Args:
        base_directory: directory containing one sub-directory per project.
        manipulation_services: services to wait for before doing anything.
        timeout: maximum seconds to wait for those services.

    Returns:
        The number of projects that could not be restarted.
    """
    errors = 0
    wait_while_manipulation_running(manipulation_services, waiting_time=600, timeout=timeout)

    unhealthy_container_names = print_bash(docker_ps_names_command("health=unhealthy"))
    exited_container_names = print_bash(docker_ps_names_command("status=exited"))
    failed_containers = unhealthy_container_names + exited_container_names

    # dict.fromkeys de-duplicates while keeping first-seen order.  Blank names
    # are skipped: they would resolve to base_directory itself.
    repositories = list(
        dict.fromkeys(
            container.split("-")[0] for container in failed_containers if container.strip()
        )
    )

    for repo in repositories:
        compose_file_path = find_docker_compose_file(os.path.join(base_directory, repo))
        if not compose_file_path:
            print("Error: Docker Compose file not found for:", repo)
            errors += 1
            continue

        print("Restarting unhealthy container in:", compose_file_path)
        project_path = os.path.dirname(compose_file_path)
        try:
            print_bash(f'cd "{project_path}" && docker-compose -p "{repo}" restart')
        except Exception as e:
            if "port is already allocated" not in str(e):
                print("Unhandled exception during restart:", e)
                errors += 1
                continue
            print("Detected port allocation problem. Executing recovery steps...")
            try:
                # Same -p as restart/up so all three act on the same project.
                print_bash(f'cd "{project_path}" && docker-compose -p "{repo}" down')
                print_bash("systemctl restart docker")
                print_bash(f'cd "{project_path}" && docker-compose -p "{repo}" up -d')
            except Exception as recovery_error:
                # A failed recovery must not abort the remaining projects.
                print("Unhandled exception during recovery:", recovery_error)
                errors += 1

    print("Finished restart procedure.")
    return errors


def build_parser() -> argparse.ArgumentParser:
    """Create the command line parser of the script."""
    parser = argparse.ArgumentParser(
        description="Restart Docker-Compose configurations with exited or unhealthy containers."
    )
    parser.add_argument(
        "--manipulation",
        metavar="SERVICE",
        nargs="*",
        # "extend" accumulates repeated flags; the default action would keep
        # only the last occurrence.
        action="extend",
        help="Blocking systemd services to wait for (can be specified multiple times).",
    )
    parser.add_argument(
        "--manipulation-string",
        type=str,
        help='Blocking services as a single string (space- or comma-separated), e.g. "svc1 svc2" or "svc1,svc2".',
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=60,
        help="Maximum time in seconds to wait for manipulation services before continuing.(Default 1min)",
    )
    parser.add_argument(
        "base_directory",
        type=str,
        help="Base directory where Docker Compose configurations are located.",
    )
    return parser


if __name__ == "__main__":
    args = build_parser().parse_args()
    services = normalize_services_arg(args.manipulation, args.manipulation_string)
    error_count = main(args.base_directory, services, args.timeout)
    raise SystemExit(min(error_count, MAX_EXIT_CODE))
system_service_tpl_on_failure: "{{ SYS_SERVICE_ON_FAILURE_COMPOSE }}" + system_service_tpl_exec_start_pre: "/usr/bin/python {{ PATH_SYSTEM_LOCK_SCRIPT }} {{ SYS_SERVICE_GROUP_MANIPULATION | join(' ') }} --ignore {{ SYS_SERVICE_GROUP_CLEANUP| join(' ') }} {{ SYS_SERVICE_REPAIR_DOCKER_SOFT }} --timeout '{{ SYS_TIMEOUT_HEAL_DOCKER }}'" + system_service_tpl_exec_start: > + /bin/sh -c '{{ system_service_script_exec }} --manipulation-string "{{ SYS_SERVICE_GROUP_MANIPULATION | join(" ") }}" {{ PATH_DOCKER_COMPOSE_INSTANCES }}' + diff --git a/roles/sys-ctl-rpr-docker-soft/templates/script.py.j2 b/roles/sys-ctl-rpr-docker-soft/templates/script.py.j2 deleted file mode 100644 index 6d640788..00000000 --- a/roles/sys-ctl-rpr-docker-soft/templates/script.py.j2 +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/python -# -# Restart Docker-Compose configurations with exited or unhealthy containers -# -import subprocess -import time -import os -import argparse - -def bash(command): - print(command) - process = subprocess.Popen([command], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - out, err = process.communicate() - stdout = out.splitlines() - stderr = err.decode("utf-8").strip() # decode stderr - output = [line.decode("utf-8") for line in stdout] - if process.returncode > 0: - print(command, out, err) - raise Exception(stderr) # pass the actual error text - return output - -def list_to_string(lst): - return ' '.join(lst) - -def print_bash(command): - output = bash(command) - print(list_to_string(output)) - return output - -def find_docker_compose_file(directory): - for root, _, files in os.walk(directory): - if 'docker-compose.yml' in files: - return os.path.join(root, 'docker-compose.yml') - return None - -def main(base_directory): - errors = 0 - waiting_time = 600 - blocker_running = True - - while blocker_running: - try: -{% for manipulation_service in SYS_SERVICE_GROUP_MANIPULATION %} - bash("systemctl is-active --quiet {{ manipulation_service }}") -{% endfor %} - 
print("Manipulation service is running.") - print(f"Trying again in {waiting_time} seconds.") - time.sleep(waiting_time) - except: - blocker_running = False - print("No blocking service is running.") - - unhealthy_container_names = print_bash("docker ps --filter health=unhealthy --format '{% raw %}{{.Names}}{% endraw %}'") - exited_container_names = print_bash("docker ps --filter status=exited --format '{% raw %}{{.Names}}{% endraw %}'") - failed_containers = unhealthy_container_names + exited_container_names - - unfiltered_failed_docker_compose_repositories = [container.split('-')[0] for container in failed_containers] - filtered_failed_docker_compose_repositories = list(dict.fromkeys(unfiltered_failed_docker_compose_repositories)) - - for repo in filtered_failed_docker_compose_repositories: - compose_file_path = find_docker_compose_file(os.path.join(base_directory, repo)) - - if compose_file_path: - print("Restarting unhealthy container in:", compose_file_path) - project_path = os.path.dirname(compose_file_path) - try: - print_bash(f'cd {project_path} && docker-compose -p "{repo}" restart') - except Exception as e: - if "port is already allocated" in str(e): - print("Detected port allocation problem. 
Executing recovery steps...") - print_bash(f'cd {project_path} && docker-compose down') - print_bash('systemctl restart docker') - print_bash(f'cd {project_path} && docker-compose -p "{repo}" up -d') - else: - print("Unhandled exception during restart:", e) - errors += 1 - else: - print("Error: Docker Compose file not found for:", repo) - errors += 1 - - - print("Finished restart procedure.") - exit(errors) - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Restart Docker-Compose configurations with exited or unhealthy containers.") - parser.add_argument("base_directory", type=str, help="Base directory where Docker Compose configurations are located.") - args = parser.parse_args() - - main(args.base_directory) \ No newline at end of file diff --git a/roles/sys-ctl-rpr-docker-soft/templates/systemctl.service.j2 b/roles/sys-ctl-rpr-docker-soft/templates/systemctl.service.j2 deleted file mode 100644 index 83e8ef57..00000000 --- a/roles/sys-ctl-rpr-docker-soft/templates/systemctl.service.j2 +++ /dev/null @@ -1,8 +0,0 @@ -[Unit] -Description=restart unhealthy docker containers -OnFailure={{ SYS_SERVICE_ON_FAILURE_COMPOSE }} - -[Service] -Type=oneshot -ExecStartPre=/usr/bin/python {{ PATH_SYSTEM_LOCK_SCRIPT }} {{ SYS_SERVICE_GROUP_MANIPULATION | join(' ') }} --ignore {{ SYS_SERVICE_GROUP_CLEANUP| join(' ') }} {{ SYS_SERVICE_REPAIR_DOCKER_SOFT }} --timeout "{{ SYS_TIMEOUT_HEAL_DOCKER }}" -ExecStart=/bin/sh -c '{{ system_service_script_exec }} {{ PATH_DOCKER_COMPOSE_INSTANCES }}' \ No newline at end of file diff --git a/tests/unit/roles/sys-ctl-rpr-docker-soft/__init__.py b/tests/unit/roles/sys-ctl-rpr-docker-soft/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/roles/sys-ctl-rpr-docker-soft/files/__init__.py b/tests/unit/roles/sys-ctl-rpr-docker-soft/files/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/roles/sys-ctl-rpr-docker-soft/files/test_script.py 
import unittest
import types
import sys
from pathlib import Path
from importlib.util import spec_from_file_location, module_from_spec
from unittest import mock


def load_script_module():
    """
    Import the script under test from roles/sys-ctl-rpr-docker-soft/files/script.py.

    Raises:
        FileNotFoundError: if the script is not at the expected location.
        ImportError: if no import spec/loader can be created for it.
    """
    test_file = Path(__file__).resolve()
    # .../tests/unit/roles/sys-ctl-rpr-docker-soft/files -> repo root
    repo_root = test_file.parents[5]
    script_path = repo_root / "roles" / "sys-ctl-rpr-docker-soft" / "files" / "script.py"
    if not script_path.exists():
        raise FileNotFoundError(f"script.py not found at {script_path}")
    spec = spec_from_file_location("rpr_soft_script", str(script_path))
    # Validate before module_from_spec(), and without assert (stripped by -O).
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot create an import spec for {script_path}")
    mod = module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod


class TestRepairDockerSoft(unittest.TestCase):
    """Unit tests for the soft docker repair script; docker/systemctl are mocked."""

    @classmethod
    def setUpClass(cls):
        cls.script = load_script_module()

    def test_normalize_services_arg(self):
        s = self.script
        self.assertEqual(
            s.normalize_services_arg(["svc-a.service", " ", "svc-b.service"], None),
            ["svc-a.service", "svc-b.service"],
        )
        self.assertEqual(
            s.normalize_services_arg(None, "svc-a.service svc-b.service"),
            ["svc-a.service", "svc-b.service"],
        )
        self.assertEqual(
            s.normalize_services_arg(None, "svc-a.service, svc-b.service, svc-c.service"),
            ["svc-a.service", "svc-b.service", "svc-c.service"],
        )
        self.assertEqual(s.normalize_services_arg([], ""), [])

    def test_wait_without_services_does_not_query_systemctl(self):
        s = self.script
        with mock.patch.object(s.subprocess, "run") as run_mock, \
                mock.patch.object(s.time, "sleep") as sleep_mock:
            s.wait_while_manipulation_running([], waiting_time=600, timeout=1200)
        run_mock.assert_not_called()
        sleep_mock.assert_not_called()

    def test_wait_while_manipulation_running_respects_timeout(self):
        s = self.script
        calls = {"checks": 0, "sleeps": 0}
        clock = {"now": 0}

        def fake_run(cmd, shell):
            self.assertIn("systemctl is-active --quiet", cmd)
            calls["checks"] += 1
            # returncode 0 == service is active, i.e. always blocking
            return types.SimpleNamespace(returncode=0)

        def fake_sleep(_secs):
            calls["sleeps"] += 1

        def fake_time():
            # each call advances time by 610s
            clock["now"] += 610
            return clock["now"]

        with mock.patch.object(s.subprocess, "run", fake_run), \
                mock.patch.object(s.time, "sleep", fake_sleep), \
                mock.patch.object(s.time, "time", fake_time):
            s.wait_while_manipulation_running(["svc-a", "svc-b"], waiting_time=600, timeout=1200)

        # 610s elapsed -> one sleep; 1220s elapsed -> timeout, no further sleep.
        self.assertEqual(calls["sleeps"], 1)
        self.assertEqual(calls["checks"], 2)

    def test_main_restarts_and_counts_errors(self):
        s = self.script
        cmd_log = []

        def fake_print_bash(cmd):
            cmd_log.append(cmd)
            if cmd.startswith("docker ps --filter health=unhealthy"):
                return ["app1-web-1", "db-1"]
            if cmd.startswith("docker ps --filter status=exited"):
                return ["app1-worker-1", "other-2"]
            return []

        def fake_find_docker_compose(path):
            if path.endswith("/app1") or path.endswith("/db"):
                return str(Path(path) / "docker-compose.yml")
            return None

        # The patched attribute names must match the names main() looks up.
        with mock.patch.object(s, "print_bash", fake_print_bash), \
                mock.patch.object(s, "find_docker_compose_file", fake_find_docker_compose):
            errors = s.main("/BASE", manipulation_services=[], timeout=None)

        # Only "other" has no compose file.
        self.assertEqual(errors, 1)

        restart_cmds = [c for c in cmd_log if "docker-compose -p" in c and " restart" in c]
        self.assertEqual(len(restart_cmds), 2)
        self.assertTrue(any('cd "/BASE/app1"' in c and 'docker-compose -p "app1" restart' in c for c in restart_cmds))
        self.assertTrue(any('cd "/BASE/db"' in c and 'docker-compose -p "db" restart' in c for c in restart_cmds))
        self.assertFalse(any('"other"' in c for c in restart_cmds))


if __name__ == "__main__":
    unittest.main()