Shortened maintenance- to maint-

This commit is contained in:
2025-07-09 03:25:03 +02:00
parent ae5f021b8d
commit d0bd33fee3
63 changed files with 96 additions and 96 deletions

View File

@@ -0,0 +1,28 @@
# Docker Healer 🩺
## Description
This Ansible role automatically restarts Docker Compose configurations with exited or unhealthy containers on Arch Linux systems. It ensures the stability of containerized workloads by recovering from common error conditions like port binding issues.
## Overview
Tailored for Arch Linux, this role monitors containers for failure states and initiates a controlled restart of affected Compose configurations. If port conflicts prevent recovery, the role stops the affected stack, restarts Docker, and recreates the container environment.
## Purpose
The purpose of this role is to provide automated healing for Docker Compose environments, minimizing manual recovery effort and reducing downtime.
## Features
- **Container Health Monitoring:** Detects unhealthy or exited containers.
- **Automated Recovery:** Restarts failed containers and resolves port binding issues.
- **Run-once Setup Logic:** Ensures idempotent execution by controlling task flow with internal flags.
- **System Role Integration:** Seamlessly integrates with CyMaIS system maintenance logic.
## Credits 📝
Developed and maintained by **Kevin Veen-Birkenbach**.
Learn more at [www.veen.world](https://www.veen.world)
Part of the [CyMaIS Project](https://github.com/kevinveenbirkenbach/cymais)
License: [CyMaIS NonCommercial License (CNCL)](https://s.veen.world/cncl)

View File

@@ -0,0 +1,89 @@
#!/bin/python
#
# Restart Docker-Compose configurations with exited or unhealthy containers
#
import subprocess
import time
import os
import argparse
def bash(command):
    """Run *command* through the shell and return its stdout as a list of lines.

    The command is echoed before execution. Both stdout and stderr are
    captured; stdout is split into lines and decoded as UTF-8.

    Args:
        command: Shell command line to execute.

    Returns:
        A list of decoded stdout lines (empty when the command printed nothing).

    Raises:
        Exception: If the command finishes with a non-zero return code. The
            message is the command's stripped stderr, or a short status note
            when stderr is empty.
    """
    print(command)
    process = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        shell=True,
    )
    out, err = process.stdout, process.stderr
    # Undecodable bytes must not mask the real result of the command.
    stderr = err.decode("utf-8", errors="replace").strip()
    # A negative return code means the process was terminated by a signal;
    # that is a failure just like a positive exit status.
    if process.returncode != 0:
        print(command, out, err)
        raise Exception(
            stderr or f"command exited with status {process.returncode}"
        )
    return [line.decode("utf-8", errors="replace") for line in out.splitlines()]
def list_to_string(lst):
    """Concatenate the strings in *lst*, separated by single spaces."""
    separator = ' '
    joined = separator.join(lst)
    return joined
def print_bash(command):
    """Run *command* via bash(), print its output on one line, and return it.

    The stdout lines returned by bash() are printed joined by single spaces
    and then handed back to the caller unchanged.
    """
    lines = bash(command)
    print(' '.join(lines))
    return lines
def find_docker_compose_file(directory):
    """Locate the first 'docker-compose.yml' at or below *directory*.

    Walks the tree with os.walk and returns the full path of the first match,
    or None when no such file exists.
    """
    target = 'docker-compose.yml'
    matches = (
        os.path.join(current, target)
        for current, _subdirs, names in os.walk(directory)
        if target in names
    )
    return next(matches, None)
def _active_blocking_services(services):
    """Return the subset of *services* that systemd reports as active."""
    active = []
    for service in services:
        try:
            # `is-active --quiet` exits non-zero when the unit is not active,
            # which bash() turns into an exception.
            bash(f"systemctl is-active --quiet {service}")
        except Exception:
            continue
        active.append(service)
    return active


def _restart_compose_project(repo, project_path):
    """Restart one Compose project; return True on success, False on failure.

    If the restart fails with a port allocation error, the stack is taken
    down, the Docker daemon is restarted and the stack is brought up again.
    """
    import shlex

    # Quote the path so directories with spaces or shell metacharacters work.
    enter = f'cd {shlex.quote(project_path)}'
    try:
        print_bash(f'{enter} && docker-compose -p "{repo}" restart')
        return True
    except Exception as e:
        if "port is already allocated" not in str(e):
            print("Unhandled exception during restart:", e)
            return False

    print("Detected port allocation problem. Executing recovery steps...")
    try:
        # Use the same project name as for restart/up so that `down` targets
        # the stack that is recreated afterwards.
        print_bash(f'{enter} && docker-compose -p "{repo}" down')
        print_bash('systemctl restart docker')
        print_bash(f'{enter} && docker-compose -p "{repo}" up -d')
    except Exception as e:
        # A failed recovery must not abort healing of the remaining projects.
        print("Recovery failed:", e)
        return False
    return True


def main(base_directory):
    """Restart every Compose project that has unhealthy or exited containers.

    Waits until neither the backup nor the update service is active, collects
    the names of unhealthy and exited containers, derives the project name
    from the part of each container name before the first '-', and restarts
    the matching Compose project found below *base_directory*.

    Args:
        base_directory: Directory containing one sub-directory per project.

    Exits the process with the number of projects that could not be healed
    (capped at 255) as exit status.
    """
    import sys

    errors = 0
    waiting_time = 600
    blocking_services = (
        "backup-docker-to-local.cymais.service",
        "update-docker.cymais.service",
    )

    # Wait while ANY blocking service is active, not only when all of them are.
    while True:
        active = _active_blocking_services(blocking_services)
        if not active:
            break
        print("Blocking service is running:", ", ".join(active))
        print(f"Trying again in {waiting_time} seconds.")
        time.sleep(waiting_time)
    print("No blocking service is running.")

    unhealthy_container_names = print_bash("docker ps --filter health=unhealthy --format '{{.Names}}'")
    exited_container_names = print_bash("docker ps --filter status=exited --format '{{.Names}}'")
    failed_containers = unhealthy_container_names + exited_container_names

    # dict.fromkeys removes duplicates while keeping first-seen order.
    repositories = list(dict.fromkeys(
        container.split('-')[0] for container in failed_containers
    ))

    for repo in repositories:
        compose_file_path = find_docker_compose_file(os.path.join(base_directory, repo))
        if not compose_file_path:
            print("Error: Docker Compose file not found for:", repo)
            errors += 1
            continue

        print("Restarting unhealthy container in:", compose_file_path)
        project_path = os.path.dirname(compose_file_path)
        if not _restart_compose_project(repo, project_path):
            errors += 1

    print("Finished restart procedure.")
    # Exit statuses are 8-bit; cap so a large error count cannot wrap to 0.
    sys.exit(min(errors, 255))
if __name__ == "__main__":
    # Command-line entry point: takes the base directory as the only
    # positional argument and passes it to main().
    cli = argparse.ArgumentParser(
        description="Restart Docker-Compose configurations with exited or unhealthy containers."
    )
    cli.add_argument(
        "base_directory",
        type=str,
        help="Base directory where Docker Compose configurations are located.",
    )
    main(cli.parse_args().base_directory)

View File

@@ -0,0 +1,5 @@
# Handler: restarts the maint-docker-heal.cymais.service unit.
# daemon_reload makes systemd re-read its unit files as part of the restart.
- name: restart maint-docker-heal.cymais.service
systemd:
name: maint-docker-heal.cymais.service
state: restarted
daemon_reload: yes

View File

@@ -0,0 +1,26 @@
---
# Role metadata: author, license, supported platform and tags.
galaxy_info:
author: "Kevin Veen-Birkenbach"
description: "Automated recovery for unhealthy or exited Docker Compose containers."
license: "CyMaIS NonCommercial License (CNCL)"
license_url: "https://s.veen.world/cncl"
company: |
Kevin Veen-Birkenbach
Consulting & Coaching Solutions
https://www.veen.world
min_ansible_version: "2.9"
platforms:
- name: Archlinux
versions:
- rolling
galaxy_tags:
- docker
- docker-compose
- systemd
- automation
- archlinux
repository: https://s.veen.world/cymais
issue_tracker_url: https://s.veen.world/cymaisissues
documentation: https://s.veen.world/cymais
# Roles this role depends on.
dependencies:
- maint-lock

View File

@@ -0,0 +1,37 @@
# Every task below is guarded by `run_once_heal_docker is not defined`; the
# final task sets that fact, so the setup is skipped once the fact exists.

# Create the directory that will hold the heal script.
- name: "create {{heal_docker}}"
file:
path: "{{heal_docker}}"
state: directory
mode: 0755
when: run_once_heal_docker is not defined
# Copy the heal script into that directory; a change notifies the restart handler.
- name: create maint-docker-heal.py
copy:
src: maint-docker-heal.py
dest: "{{heal_docker}}maint-docker-heal.py"
notify: restart maint-docker-heal.cymais.service
when: run_once_heal_docker is not defined
# Render the systemd unit from its template; a change notifies the restart handler.
- name: create maint-docker-heal.cymais.service
template:
src: maint-docker-heal.service.j2
dest: /etc/systemd/system/maint-docker-heal.cymais.service
notify: restart maint-docker-heal.cymais.service
when: run_once_heal_docker is not defined
# Set service_name to this role's name before including generic-timer
# (presumably read by that role; it is not shown here - TODO confirm).
- name: set service_name to the name of the current role
set_fact:
service_name: "{{ role_name }}"
when: run_once_heal_docker is not defined
# Include the generic-timer role, passing on_calendar_heal_docker as on_calendar.
- name: "include role for generic-timer for {{service_name}}"
include_role:
name: generic-timer
vars:
on_calendar: "{{on_calendar_heal_docker}}"
when: run_once_heal_docker is not defined
# Record that the setup has run so the guards above evaluate to false afterwards.
- name: run the heal_docker tasks once
set_fact:
run_once_heal_docker: true
when: run_once_heal_docker is not defined

View File

@@ -0,0 +1,8 @@
# systemd unit template (Jinja2) for the Docker healer.
[Unit]
Description=restart unhealthy docker containers
# On failure, start the alert-compose.cymais@ template unit with this unit's name (%n) as instance.
OnFailure=alert-compose.cymais@%n.service
[Service]
Type=oneshot
# Before the healer starts, run the script at path_system_lock_script with the
# maintenance service list, an --ignore list and a --timeout value.
# NOTE(review): the lock script itself is not shown here; presumably it waits
# for the listed services to finish - confirm against the script.
ExecStartPre=/bin/sh -c '/usr/bin/python {{ path_system_lock_script }} {{ system_maintenance_services | join(' ') }} --ignore {{system_maintenance_cleanup_services| join(' ') }} maint-docker-heal --timeout "{{system_maintenance_lock_timeout_heal_docker}}"'
# Run the heal script, passing path_docker_compose_instances as its base directory argument.
ExecStart=/bin/sh -c '/bin/python {{heal_docker}}maint-docker-heal.py {{path_docker_compose_instances}}'

View File

@@ -0,0 +1 @@
# Directory for the heal script, built from path_administrator_scripts.
# The value ends with "/" so file names can be appended to it directly.
heal_docker: "{{path_administrator_scripts}}maint-docker-heal/"