Refactor sys-ctl-rpr-docker-soft role to use standalone Python script with argparse and unittests

- Replace Jinja2 template (script.py.j2) with raw Python script (files/script.py)
- Add argparse options: --manipulation, --manipulation-string, --timeout
- Implement timeout handling in wait_while_manipulation_running
- Update systemd ExecStart/ExecStartPre handling in tasks/01_core.yml
- Remove obsolete systemctl.service.j2 and script.py.j2 templates
- Add unittest suite under tests/unit/roles/sys-ctl-rpr-docker-soft/files/test_script.py
- Mock docker and systemctl calls in tests for safe execution

Reference: ChatGPT conversation (see https://chatgpt.com/share/68ad770b-ea84-800f-b378-559cb61fc43a)
This commit is contained in:
2025-08-26 10:58:17 +02:00
parent 7ad14673e1
commit e417bc19bd
7 changed files with 289 additions and 101 deletions

View File

@@ -0,0 +1,166 @@
#!/usr/bin/env python3
"""
Restart Docker-Compose configurations with exited or unhealthy containers.
This version receives the *manipulation services* via argparse (no Jinja).
"""
import subprocess
import time
import os
import argparse
from typing import List
def bash(command: str) -> List[str]:
print(command)
process = subprocess.Popen(
[command], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
)
out, err = process.communicate()
stdout = out.splitlines()
stderr = err.decode("utf-8", errors="replace").strip()
output = [line.decode("utf-8", errors="replace") for line in stdout]
if process.returncode > 0:
print(command, out, err)
raise Exception(stderr or f"Command failed with code {process.returncode}")
return output
def list_to_string(lst: List[str]) -> str:
return " ".join(lst)
def print_bash(command: str) -> List[str]:
output = bash(command)
if output:
print(list_to_string(output))
return output
def find_docker_compose_file(directory: str) -> str | None:
for root, _, files in os.walk(directory):
if "docker-compose.yml" in files:
return os.path.join(root, "docker-compose.yml")
return None
def normalize_services_arg(raw: List[str] | None, raw_str: str | None) -> List[str]:
"""
Accept either:
- multiple --manipulation SERVICE flags (nargs='*')
- a single --manipulation-string "svc1 svc2 ..." (space or comma separated)
"""
if raw:
return [s for s in raw if s.strip()]
if raw_str:
# split on comma or whitespace
parts = [p.strip() for chunk in raw_str.split(",") for p in chunk.split()]
return [p for p in parts if p]
return []
def wait_while_manipulation_running(
services: List[str],
waiting_time: int = 600,
timeout: int | None = None,
) -> None:
"""
Wait until none of the given services are active anymore.
Stops waiting if timeout (in seconds) is reached.
"""
if not services:
print("No manipulation services provided. Continuing without wait.")
return
start = time.time()
while True:
any_active = False
for svc in services:
res = subprocess.run(f"systemctl is-active --quiet {svc}", shell=True)
if res.returncode == 0:
any_active = True
break
if any_active:
# Check timeout
elapsed = time.time() - start
if timeout and elapsed >= timeout:
print(f"Timeout ({timeout}s) reached while waiting for services. Continuing anyway.")
break
print(f"Manipulation service is running. Trying again in {waiting_time} seconds.")
time.sleep(waiting_time)
else:
print("No blocking service is running.")
break
def main(base_directory: str, manipulation_services: List[str], timeout: int | None) -> int:
errors = 0
wait_while_manipulation_running(manipulation_services, waiting_time=600, timeout=timeout)
unhealthy_container_names = print_bash(
"docker ps --filter health=unhealthy --format '{{{{.Names}}}}'"
)
exited_container_names = print_bash(
"docker ps --filter status=exited --format '{{{{.Names}}}}'"
)
failed_containers = unhealthy_container_names + exited_container_names
unfiltered_failed_docker_compose_repositories = [
container.split("-")[0] for container in failed_containers
]
filtered_failed_docker_compose_repositories = list(
dict.fromkeys(unfiltered_failed_docker_compose_repositories)
)
for repo in filtered_failed_docker_compose_repositories:
compose_file_path = find_docker_compose_file(os.path.join(base_directory, repo))
if compose_file_path:
print("Restarting unhealthy container in:", compose_file_path)
project_path = os.path.dirname(compose_file_path)
try:
print_bash(f'cd "{project_path}" && docker-compose -p "{repo}" restart')
except Exception as e:
if "port is already allocated" in str(e):
print("Detected port allocation problem. Executing recovery steps...")
print_bash(f'cd "{project_path}" && docker-compose down')
print_bash("systemctl restart docker")
print_bash(f'cd "{project_path}" && docker-compose -p "{repo}" up -d')
else:
print("Unhandled exception during restart:", e)
errors += 1
else:
print("Error: Docker Compose file not found for:", repo)
errors += 1
print("Finished restart procedure.")
return errors
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Restart Docker-Compose configurations with exited or unhealthy containers."
)
parser.add_argument(
"--manipulation",
metavar="SERVICE",
nargs="*",
help="Blocking systemd services to wait for (can be specified multiple times).",
)
parser.add_argument(
"--manipulation-string",
type=str,
help='Blocking services as a single string (space- or comma-separated), e.g. "svc1 svc2" or "svc1,svc2".',
)
parser.add_argument(
"--timeout",
type=int,
default=60,
help="Maximum time in seconds to wait for manipulation services before continuing.(Default 1min)",
)
parser.add_argument(
"base_directory",
type=str,
help="Base directory where Docker Compose configurations are located.",
)
args = parser.parse_args()
services = normalize_services_arg(args.manipulation, args.manipulation_string)
exit(main(args.base_directory, services, args.timeout))

View File

@@ -6,6 +6,10 @@
- include_role:
name: sys-service
vars:
system_service_on_calendar: "{{SYS_SCHEDULE_REPAIR_DOCKER_SOFT}}"
system_service_timer_enabled: true
system_service_tpl_on_failure: "{{ SYS_SERVICE_ON_FAILURE_COMPOSE }}"
system_service_on_calendar: "{{ SYS_SCHEDULE_REPAIR_DOCKER_SOFT }}"
system_service_timer_enabled: true
system_service_tpl_on_failure: "{{ SYS_SERVICE_ON_FAILURE_COMPOSE }}"
system_service_tpl_exec_start_pre: "/usr/bin/python {{ PATH_SYSTEM_LOCK_SCRIPT }} {{ SYS_SERVICE_GROUP_MANIPULATION | join(' ') }} --ignore {{ SYS_SERVICE_GROUP_CLEANUP| join(' ') }} {{ SYS_SERVICE_REPAIR_DOCKER_SOFT }} --timeout '{{ SYS_TIMEOUT_HEAL_DOCKER }}'"
system_service_tpl_exec_start: >
/bin/sh -c '{{ system_service_script_exec }} --manipulation-string "{{ SYS_SERVICE_GROUP_MANIPULATION | join(" ") }}" {{ PATH_DOCKER_COMPOSE_INSTANCES }}'

View File

@@ -1,90 +0,0 @@
#!/bin/python
#
# Restart Docker-Compose configurations with exited or unhealthy containers
#
import subprocess
import time
import os
import argparse
def bash(command):
print(command)
process = subprocess.Popen([command], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
out, err = process.communicate()
stdout = out.splitlines()
stderr = err.decode("utf-8").strip() # decode stderr
output = [line.decode("utf-8") for line in stdout]
if process.returncode > 0:
print(command, out, err)
raise Exception(stderr) # pass the actual error text
return output
def list_to_string(lst):
return ' '.join(lst)
def print_bash(command):
output = bash(command)
print(list_to_string(output))
return output
def find_docker_compose_file(directory):
for root, _, files in os.walk(directory):
if 'docker-compose.yml' in files:
return os.path.join(root, 'docker-compose.yml')
return None
def main(base_directory):
errors = 0
waiting_time = 600
blocker_running = True
while blocker_running:
try:
{% for manipulation_service in SYS_SERVICE_GROUP_MANIPULATION %}
bash("systemctl is-active --quiet {{ manipulation_service }}")
{% endfor %}
print("Manipulation service is running.")
print(f"Trying again in {waiting_time} seconds.")
time.sleep(waiting_time)
except:
blocker_running = False
print("No blocking service is running.")
unhealthy_container_names = print_bash("docker ps --filter health=unhealthy --format '{% raw %}{{.Names}}{% endraw %}'")
exited_container_names = print_bash("docker ps --filter status=exited --format '{% raw %}{{.Names}}{% endraw %}'")
failed_containers = unhealthy_container_names + exited_container_names
unfiltered_failed_docker_compose_repositories = [container.split('-')[0] for container in failed_containers]
filtered_failed_docker_compose_repositories = list(dict.fromkeys(unfiltered_failed_docker_compose_repositories))
for repo in filtered_failed_docker_compose_repositories:
compose_file_path = find_docker_compose_file(os.path.join(base_directory, repo))
if compose_file_path:
print("Restarting unhealthy container in:", compose_file_path)
project_path = os.path.dirname(compose_file_path)
try:
print_bash(f'cd {project_path} && docker-compose -p "{repo}" restart')
except Exception as e:
if "port is already allocated" in str(e):
print("Detected port allocation problem. Executing recovery steps...")
print_bash(f'cd {project_path} && docker-compose down')
print_bash('systemctl restart docker')
print_bash(f'cd {project_path} && docker-compose -p "{repo}" up -d')
else:
print("Unhandled exception during restart:", e)
errors += 1
else:
print("Error: Docker Compose file not found for:", repo)
errors += 1
print("Finished restart procedure.")
exit(errors)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Restart Docker-Compose configurations with exited or unhealthy containers.")
parser.add_argument("base_directory", type=str, help="Base directory where Docker Compose configurations are located.")
args = parser.parse_args()
main(args.base_directory)

View File

@@ -1,8 +0,0 @@
[Unit]
Description=restart unhealthy docker containers
OnFailure={{ SYS_SERVICE_ON_FAILURE_COMPOSE }}
[Service]
Type=oneshot
ExecStartPre=/usr/bin/python {{ PATH_SYSTEM_LOCK_SCRIPT }} {{ SYS_SERVICE_GROUP_MANIPULATION | join(' ') }} --ignore {{ SYS_SERVICE_GROUP_CLEANUP| join(' ') }} {{ SYS_SERVICE_REPAIR_DOCKER_SOFT }} --timeout "{{ SYS_TIMEOUT_HEAL_DOCKER }}"
ExecStart=/bin/sh -c '{{ system_service_script_exec }} {{ PATH_DOCKER_COMPOSE_INSTANCES }}'