Refactor systemctl services and categories due to alarm bugs

This commit restructures systemctl service definitions and category mappings.

Motivation: Alarm-related bugs revealed inconsistencies in service and role handling.

Preparation step: lays the groundwork for fixing the alarm issues by aligning categories, roles, and service templates.
This commit is contained in:
2025-08-18 13:35:43 +02:00
parent 29f50da226
commit 3a839cfe37
289 changed files with 975 additions and 948 deletions

View File

@@ -0,0 +1,28 @@
# Docker Healer 🩺
## Description
This Ansible role automatically restarts Docker Compose configurations with exited or unhealthy containers on Arch Linux systems. It ensures the stability of containerized workloads by recovering from common error conditions like port binding issues.
## Overview
Tailored for Arch Linux, this role monitors containers for failure states and initiates a controlled restart of affected Compose configurations. If port conflicts prevent recovery, the role stops the affected stack, restarts Docker, and recreates the container environment.
## Purpose
The purpose of this role is to provide automated healing for Docker Compose environments, minimizing manual recovery effort and reducing downtime.
## Features
- **Container Health Monitoring:** Detects unhealthy or exited containers.
- **Automated Recovery:** Restarts failed containers and resolves port binding issues.
- **Run-once Setup Logic:** Ensures idempotent execution by controlling task flow with internal flags.
- **System Role Integration:** Seamlessly integrates with Infinito.Nexus system maintenance logic.
## Credits 📝
Developed and maintained by **Kevin Veen-Birkenbach**.
Learn more at [www.veen.world](https://www.veen.world)
Part of the [Infinito.Nexus Project](https://s.infinito.nexus/code)
License: [Infinito.Nexus NonCommercial License](https://s.infinito.nexus/license)

View File

@@ -0,0 +1,5 @@
---
# Handler notified by the tasks that deploy the healer script and its unit
# file. The handler name must stay unchanged because `notify` refers to it.
- name: restart sys-ctl-rpr-docker-soft service
  systemd:
    name: "sys-ctl-rpr-docker-soft{{ SYS_SERVICE_SUFFIX }}"
    state: restarted
    # Reload systemd first so a changed unit file is picked up by the restart.
    daemon_reload: true

View File

@@ -0,0 +1,23 @@
---
# Ansible Galaxy metadata for the Docker healer role.
galaxy_info:
  # Authorship
  author: 'Kevin Veen-Birkenbach'
  company: |
    Kevin Veen-Birkenbach
    Consulting & Coaching Solutions
    https://www.veen.world
  description: 'Automated recovery for unhealthy or exited Docker Compose containers.'

  # Licensing
  license: 'Infinito.Nexus NonCommercial License'
  license_url: 'https://s.infinito.nexus/license'

  # Compatibility
  min_ansible_version: '2.9'
  platforms:
    - name: Archlinux
      versions:
        - rolling

  # Search tags
  galaxy_tags:
    - docker
    - docker-compose
    - systemd
    - automation
    - archlinux

  # Project links
  repository: 'https://s.infinito.nexus/code'
  issue_tracker_url: 'https://s.infinito.nexus/issues'
  documentation: 'https://docs.infinito.nexus/'

View File

@@ -0,0 +1,32 @@
---
# Core tasks of the Docker healer role: deploy the healer script and its
# systemd unit, then hand over to the sys-timer role for scheduling.

# Pull in the sys-lock role unless its run-once flag is already defined.
- name: Include dependency 'sys-lock'
  when: run_once_sys_lock is not defined
  include_role:
    name: sys-lock

# Directory that will contain the healer script.
- name: "create {{ heal_docker }}"
  file:
    state: directory
    path: "{{ heal_docker }}"
    mode: "0755"

# Render the healer script; a change triggers the restart handler.
- name: create sys-ctl-rpr-docker-soft.py
  notify: restart sys-ctl-rpr-docker-soft service
  template:
    src: sys-ctl-rpr-docker-soft.py.j2
    dest: "{{ heal_docker }}sys-ctl-rpr-docker-soft.py"

# Render the systemd unit; a change triggers the restart handler.
- name: "create sys-ctl-rpr-docker-soft{{ SYS_SERVICE_SUFFIX }}"
  notify: restart sys-ctl-rpr-docker-soft service
  template:
    src: sys-ctl-rpr-docker-soft.service.j2
    dest: "/etc/systemd/system/sys-ctl-rpr-docker-soft{{ SYS_SERVICE_SUFFIX }}"

# Expose the role name as 'service_name' before sys-timer is included
# (presumably sys-timer reads this fact - confirm against that role).
- name: "set 'service_name' to '{{ role_name }}'"
  set_fact:
    service_name: "{{ role_name }}"

# Schedule the service through the sys-timer role.
- name: "include role for sys-timer for {{ service_name }}"
  include_role:
    name: sys-timer
  vars:
    on_calendar: "{{ SYS_SCHEDULE_REPAIR_DOCKER_SOFT }}"

View File

@@ -0,0 +1,4 @@
---
# Run the core tasks only while the run-once flag of this role is undefined.
# NOTE(review): utils/run_once.yml presumably defines that flag - confirm.
- when: run_once_sys_ctl_rpr_docker_soft is not defined
  block:
    - include_tasks: 01_core.yml
    - include_tasks: utils/run_once.yml

View File

@@ -0,0 +1,89 @@
#!/bin/python
#
# Restart Docker-Compose configurations with exited or unhealthy containers
#
import subprocess
import time
import os
import argparse
def bash(command):
    """Run ``command`` through the shell and return its stdout as a list of lines.

    Args:
        command: Shell command line to execute.

    Returns:
        list[str]: The UTF-8 decoded stdout lines, without line terminators.

    Raises:
        Exception: If the command exits with a non-zero status or is
            terminated by a signal. The message is the stripped stderr text,
            or a generic description when stderr is empty.
    """
    print(command)
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    out, err = process.communicate()
    stderr = err.decode("utf-8").strip()
    output = [line.decode("utf-8") for line in out.splitlines()]
    # A negative return code means the process was killed by a signal. That is
    # a failure as well, so compare against zero instead of testing "> 0".
    if process.returncode != 0:
        print(command, out, err)
        raise Exception(stderr or f"Command failed with return code {process.returncode}: {command}")
    return output
def list_to_string(lst):
    """Concatenate the strings in ``lst`` into one string separated by single spaces."""
    separator = " "
    return separator.join(lst)
def print_bash(command):
    """Run ``command`` via ``bash()``, print its output on one line and return the lines."""
    lines = bash(command)
    # Same formatting as list_to_string(): the lines joined by single spaces.
    print(' '.join(lines))
    return lines
def find_docker_compose_file(directory):
    """Return the path of the first ``docker-compose.yml`` found below ``directory``.

    The tree is traversed with ``os.walk``; ``None`` is returned when no
    directory in the tree contains such a file.
    """
    target = 'docker-compose.yml'
    matches = (
        os.path.join(current, target)
        for current, _subdirs, names in os.walk(directory)
        if target in names
    )
    return next(matches, None)
def _is_service_active(service):
    """Return True when ``systemctl is-active`` exits successfully for ``service``."""
    try:
        bash(f"systemctl is-active --quiet {service}")
    except Exception:
        # bash() raises on every non-zero exit status, i.e. the unit is not active.
        return False
    return True


def main(base_directory):
    """Restart Docker Compose projects that contain unhealthy or exited containers.

    The function first waits until none of the blocking services is active.
    It then derives the project name of every failed container from the part
    of the container name before the first ``-``, looks up the project's
    ``docker-compose.yml`` below ``base_directory/<project>`` and restarts it.
    When the restart fails because a port is already allocated, the project is
    taken down, Docker is restarted and the project is brought up again.

    Args:
        base_directory: Base directory where Docker Compose configurations
            are located.

    Raises:
        SystemExit: Always. The exit code is the number of projects that could
            not be handled, capped at 255.
    """
    errors = 0
    waiting_time = 600
    blocking_services = (
        "sys-ctl-bkp-docker-2-loc{{ SYS_SERVICE_SUFFIX }}",
        "update-docker{{ SYS_SERVICE_SUFFIX }}",
    )

    # Wait while ANY blocking service is active. Each service is checked on
    # its own so that a single running blocker is enough to postpone the run.
    while True:
        active_services = [service for service in blocking_services if _is_service_active(service)]
        if not active_services:
            break
        print("Blocking service is running:", ", ".join(active_services))
        print(f"Trying again in {waiting_time} seconds.")
        time.sleep(waiting_time)
    print("No blocking service is running.")

    unhealthy_container_names = print_bash("docker ps --filter health=unhealthy --format '{% raw %}{{.Names}}{% endraw %}'")
    exited_container_names = print_bash("docker ps --filter status=exited --format '{% raw %}{{.Names}}{% endraw %}'")
    failed_containers = unhealthy_container_names + exited_container_names

    # The text before the first "-" of a container name is used as project name.
    unfiltered_failed_docker_compose_repositories = [container.split('-')[0] for container in failed_containers]
    # dict.fromkeys() removes duplicates while keeping the first-seen order.
    filtered_failed_docker_compose_repositories = list(dict.fromkeys(unfiltered_failed_docker_compose_repositories))

    for repo in filtered_failed_docker_compose_repositories:
        compose_file_path = find_docker_compose_file(os.path.join(base_directory, repo))
        if not compose_file_path:
            print("Error: Docker Compose file not found for:", repo)
            errors += 1
            continue

        print("Restarting unhealthy container in:", compose_file_path)
        project_path = os.path.dirname(compose_file_path)
        try:
            print_bash(f'cd {project_path} && docker-compose -p "{repo}" restart')
        except Exception as e:
            if "port is already allocated" not in str(e):
                print("Unhandled exception during restart:", e)
                errors += 1
                continue

            print("Detected port allocation problem. Executing recovery steps...")
            try:
                # Use the same project name for "down" as for "restart" and
                # "up" so that all three commands address the same project.
                print_bash(f'cd {project_path} && docker-compose -p "{repo}" down')
                print_bash('systemctl restart docker')
                print_bash(f'cd {project_path} && docker-compose -p "{repo}" up -d')
            except Exception as recovery_error:
                # Count the failure and continue with the remaining projects.
                print("Recovery failed:", recovery_error)
                errors += 1

    print("Finished restart procedure.")
    # Exit statuses are truncated to 8 bits; cap the value so that a large
    # error count can never wrap around to 0 (success).
    raise SystemExit(min(errors, 255))
if __name__ == "__main__":
    # Command-line entry point: one positional argument that is passed to main().
    cli = argparse.ArgumentParser(
        description="Restart Docker-Compose configurations with exited or unhealthy containers."
    )
    cli.add_argument(
        "base_directory",
        type=str,
        help="Base directory where Docker Compose configurations are located.",
    )
    main(cli.parse_args().base_directory)

View File

@@ -0,0 +1,8 @@
[Unit]
Description=restart unhealthy docker containers
# When this unit fails, start the sys-ctl-alm-compose template unit
# instantiated with this unit's full name (%n).
OnFailure=sys-ctl-alm-compose.{{ SOFTWARE_NAME }}@%n.service

[Service]
Type=oneshot
# Run the system lock script before the healer starts.
# NOTE(review): presumably it waits for conflicting services - confirm.
ExecStartPre=/bin/sh -c '/usr/bin/python {{ PATH_SYSTEM_LOCK_SCRIPT }} {{ SYS_SERVICE_GROUP_MANIPULATION | join(' ') }} --ignore {{ SYS_SERVICE_GROUP_CLEANUP | join(' ') }} sys-ctl-rpr-docker-soft --timeout "{{ SYS_TIMEOUT_HEAL_DOCKER }}"'
# Run the healer script with the Docker Compose base directory as argument.
ExecStart=/bin/sh -c '/bin/python {{ heal_docker }}sys-ctl-rpr-docker-soft.py {{ PATH_DOCKER_COMPOSE_INSTANCES }}'

View File

@@ -0,0 +1,2 @@
# Directory (with trailing slash) that holds the healer script.
heal_docker: "{{ PATH_ADMINISTRATOR_SCRIPTS }}sys-ctl-rpr-docker-soft/"