Another big round of refactoring and cleaning...

This commit is contained in:
2025-07-11 17:55:26 +02:00
parent aa61bf2a44
commit 168c5c0da6
323 changed files with 761 additions and 811 deletions

View File

@@ -0,0 +1,28 @@
# Docker Healer 🩺
## Description
This Ansible role automatically restarts Docker Compose configurations with exited or unhealthy containers on Arch Linux systems. It ensures the stability of containerized workloads by recovering from common error conditions like port binding issues.
## Overview
Tailored for Arch Linux, this role monitors containers for failure states and initiates a controlled restart of affected Compose configurations. If port conflicts prevent recovery, the role stops the affected stack, restarts Docker, and recreates the container environment.
## Purpose
The purpose of this role is to provide automated healing for Docker Compose environments, minimizing manual recovery effort and reducing downtime.
## Features
- **Container Health Monitoring:** Detects unhealthy or exited containers.
- **Automated Recovery:** Restarts failed containers and resolves port binding issues.
- **Run-once Setup Logic:** Guards every setup task with an internal flag (`run_once_heal_docker`) so the role's setup is applied only once, even if the role is included several times.
- **System Role Integration:** Seamlessly integrates with CyMaIS system maintenance logic.
## Credits 📝
Developed and maintained by **Kevin Veen-Birkenbach**.
Learn more at [www.veen.world](https://www.veen.world)
Part of the [CyMaIS Project](https://github.com/kevinveenbirkenbach/cymais)
License: [CyMaIS NonCommercial License (CNCL)](https://s.veen.world/cncl)

View File

@@ -0,0 +1,89 @@
#!/bin/python
#
# Restart Docker-Compose configurations with exited or unhealthy containers
#
import subprocess
import time
import os
import argparse
def bash(command):
    """Run *command* through the shell and return its stdout as a list of lines.

    The command line is echoed before it is executed.

    Args:
        command: Shell command line, handed to the shell verbatim.

    Returns:
        The decoded stdout lines, without line terminators.

    Raises:
        Exception: If the command exits with a non-zero status or is
            terminated by a signal. The message is the stripped stderr text.
    """
    print(command)
    process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        shell=True,
    )
    out, err = process.communicate()
    # Undecodable bytes must not mask the real outcome of the command.
    stderr = err.decode("utf-8", errors="replace").strip()
    output = [line.decode("utf-8", errors="replace") for line in out.splitlines()]
    # A process killed by a signal has a NEGATIVE return code, so compare
    # against zero instead of checking for "greater than zero".
    if process.returncode != 0:
        print(command, out, err)
        raise Exception(stderr)  # pass the actual error text
    return output
def list_to_string(lst):
    """Return the strings in *lst* concatenated with single spaces."""
    separator = ' '
    return separator.join(lst)
def print_bash(command):
    """Run *command* via bash(), print its output on one line, and return the lines.

    Any exception raised by bash() propagates unchanged.
    """
    lines = bash(command)
    joined = list_to_string(lines)
    print(joined)
    return lines
def find_docker_compose_file(directory):
    """Locate the first 'docker-compose.yml' at or below *directory*.

    The tree is walked with os.walk and the search stops at the first hit.

    Returns:
        The path of the compose file, or None when no such file exists
        (including when *directory* itself does not exist).
    """
    target = 'docker-compose.yml'
    matches = (
        os.path.join(current_dir, target)
        for current_dir, _subdirs, filenames in os.walk(directory)
        if target in filenames
    )
    return next(matches, None)
def _is_service_active(service):
    """Return True when `systemctl is-active` reports *service* as active."""
    try:
        bash(f"systemctl is-active --quiet {service}")
    except Exception:
        # bash() raises on every non-zero exit status, which is how
        # `systemctl is-active` signals "not active".
        return False
    return True


def main(base_directory):
    """Restart every Compose project that has unhealthy or exited containers.

    Steps:
      1. Wait until none of the blocking services (backup, update) is active.
      2. Collect the names of unhealthy and exited containers from `docker ps`.
      3. Derive the project name from each container name, find its
         docker-compose.yml below *base_directory*, and restart the project.
         If the restart fails because a port is already allocated, take the
         project down, restart Docker, and bring the project up again.

    Args:
        base_directory: Directory containing one sub-directory per project.

    Exits the process with the number of failed projects (capped at 255),
    so 0 means every affected project was handled successfully.
    """
    # Local imports: the file-level import block does not provide these.
    import shlex
    import sys

    errors = 0
    waiting_time = 600
    blocking_services = (
        "sys-bkp-docker-to-local.cymais.service",
        "update-docker.cymais.service",
    )

    # Wait while ANY blocking service is active. Checking them in one
    # try-block would only wait when all of them are active at once.
    while True:
        active_blockers = [
            service for service in blocking_services if _is_service_active(service)
        ]
        if not active_blockers:
            break
        print("Blocking service is running:", ", ".join(active_blockers))
        print(f"Trying again in {waiting_time} seconds.")
        time.sleep(waiting_time)
    print("No blocking service is running.")

    unhealthy_container_names = print_bash("docker ps --filter health=unhealthy --format '{{.Names}}'")
    exited_container_names = print_bash("docker ps --filter status=exited --format '{{.Names}}'")
    failed_containers = unhealthy_container_names + exited_container_names

    # The project name is taken to be the container-name prefix before the
    # first '-'. dict.fromkeys de-duplicates while keeping first-seen order.
    # NOTE(review): a project whose own name contains '-' will not be found.
    failed_repositories = list(
        dict.fromkeys(container.split('-')[0] for container in failed_containers)
    )

    for repo in failed_repositories:
        compose_file_path = find_docker_compose_file(os.path.join(base_directory, repo))
        if not compose_file_path:
            print("Error: Docker Compose file not found for:", repo)
            errors += 1
            continue

        print("Restarting unhealthy container in:", compose_file_path)
        # Quote the directory so paths with spaces or shell metacharacters
        # survive the trip through the shell.
        project_path = shlex.quote(os.path.dirname(compose_file_path))
        try:
            print_bash(f'cd {project_path} && docker-compose -p "{repo}" restart')
        except Exception as e:
            if "port is already allocated" in str(e):
                print("Detected port allocation problem. Executing recovery steps...")
                try:
                    # Use the same explicit project name as restart/up so
                    # `down` targets the very same stack.
                    print_bash(f'cd {project_path} && docker-compose -p "{repo}" down')
                    print_bash('systemctl restart docker')
                    print_bash(f'cd {project_path} && docker-compose -p "{repo}" up -d')
                except Exception as recovery_error:
                    # A failed recovery of one project must not prevent the
                    # remaining projects from being healed.
                    print("Recovery failed:", recovery_error)
                    errors += 1
            else:
                print("Unhandled exception during restart:", e)
                errors += 1

    print("Finished restart procedure.")
    # Exit statuses are 8 bit wide; without the cap 256 errors would wrap
    # around to 0 and look like success.
    sys.exit(min(errors, 255))
if __name__ == "__main__":
    # Command-line entry point: the single positional argument is the
    # directory that holds the Docker Compose projects.
    cli = argparse.ArgumentParser(
        description="Restart Docker-Compose configurations with exited or unhealthy containers."
    )
    cli.add_argument(
        "base_directory",
        type=str,
        help="Base directory where Docker Compose configurations are located.",
    )
    main(cli.parse_args().base_directory)

View File

@@ -0,0 +1,5 @@
# Handler notified by the tasks that install the healer script and its unit file.
- name: restart sys-rpr-docker-soft.cymais.service
systemd:
name: sys-rpr-docker-soft.cymais.service
state: restarted
# Reload systemd's unit definitions so a changed unit file takes effect.
daemon_reload: yes

View File

@@ -0,0 +1,26 @@
---
# Ansible Galaxy metadata for the Docker healer role.
galaxy_info:
author: "Kevin Veen-Birkenbach"
description: "Automated recovery for unhealthy or exited Docker Compose containers."
license: "CyMaIS NonCommercial License (CNCL)"
license_url: "https://s.veen.world/cncl"
company: |
Kevin Veen-Birkenbach
Consulting & Coaching Solutions
https://www.veen.world
min_ansible_version: "2.9"
platforms:
- name: Archlinux
versions:
- rolling
galaxy_tags:
- docker
- docker-compose
- systemd
- automation
- archlinux
repository: https://s.veen.world/cymais
issue_tracker_url: https://s.veen.world/cymaisissues
documentation: https://s.veen.world/cymais
# Roles applied before this one.
# NOTE(review): sys-lock presumably provides the lock script referenced by the service unit - confirm.
dependencies:
- sys-lock

View File

@@ -0,0 +1,37 @@
# Installs the healer script and its systemd unit, then includes sys-timer.
# Every task is skipped once run_once_heal_docker has been set by the final task.
- name: "create {{heal_docker}}"
file:
path: "{{heal_docker}}"
state: directory
mode: 0755
when: run_once_heal_docker is not defined
# dest is built by plain concatenation, so heal_docker must end with "/".
- name: create sys-rpr-docker-soft.py
copy:
src: sys-rpr-docker-soft.py
dest: "{{heal_docker}}sys-rpr-docker-soft.py"
notify: restart sys-rpr-docker-soft.cymais.service
when: run_once_heal_docker is not defined
- name: create sys-rpr-docker-soft.cymais.service
template:
src: sys-rpr-docker-soft.service.j2
dest: /etc/systemd/system/sys-rpr-docker-soft.cymais.service
notify: restart sys-rpr-docker-soft.cymais.service
when: run_once_heal_docker is not defined
# NOTE(review): service_name is presumably read by the sys-timer role included below - confirm.
- name: "set 'service_name' to '{{ role_name }}'"
set_fact:
service_name: "{{ role_name }}"
when: run_once_heal_docker is not defined
- name: "include role for sys-timer for {{service_name}}"
include_role:
name: sys-timer
vars:
on_calendar: "{{on_calendar_heal_docker}}"
when: run_once_heal_docker is not defined
# Sets the guard flag that all "when" conditions above test for.
- name: run the heal_docker tasks once
set_fact:
run_once_heal_docker: true
when: run_once_heal_docker is not defined

View File

@@ -0,0 +1,8 @@
[Unit]
Description=restart unhealthy docker containers
# If this unit fails, start the sys-alm-compose.cymais@ template unit with this unit's name (%n) as its instance.
OnFailure=sys-alm-compose.cymais@%n.service
[Service]
Type=oneshot
# Runs the system lock script first, passing the maintenance services, an --ignore list that includes this unit, and a timeout.
# NOTE(review): presumably this blocks until none of the listed services is running - confirm against the lock script.
ExecStartPre=/bin/sh -c '/usr/bin/python {{ path_system_lock_script }} {{ system_maintenance_services | join(' ') }} --ignore {{system_maintenance_cleanup_services| join(' ') }} sys-rpr-docker-soft --timeout "{{system_maintenance_lock_timeout_heal_docker}}"'
# Runs the healer script with the Docker Compose instances directory as its only argument.
ExecStart=/bin/sh -c '/bin/python {{heal_docker}}sys-rpr-docker-soft.py {{path_docker_compose_instances}}'

View File

@@ -0,0 +1,2 @@
# Directory the healer script is installed into. Keep the trailing slash:
# the role's tasks and service template append the file name directly.
heal_docker: '{{path_administrator_scripts}}sys-rpr-docker-soft/'