mirror of
https://github.com/kevinveenbirkenbach/computer-playbook.git
synced 2025-08-30 15:28:12 +02:00
Refactor systemctl services and categories due to alarm bugs
This commit restructures systemctl service definitions and category mappings. Motivation: Alarm-related bugs revealed inconsistencies in service and role handling. Preparation step: lays the groundwork for fixing the alarm issues by aligning categories, roles, and service templates.
This commit is contained in:
28
roles/sys-ctl-rpr-docker-soft/README.md
Normal file
28
roles/sys-ctl-rpr-docker-soft/README.md
Normal file
@@ -0,0 +1,28 @@
|
||||
# Docker Healer 🩺
|
||||
|
||||
## Description
|
||||
|
||||
This Ansible role automatically restarts Docker Compose configurations with exited or unhealthy containers on Arch Linux systems. It ensures the stability of containerized workloads by recovering from common error conditions like port binding issues.
|
||||
|
||||
## Overview
|
||||
|
||||
Tailored for Arch Linux, this role monitors containers for failure states and initiates a controlled restart of affected Compose configurations. If port conflicts prevent recovery, the role stops the affected stack, restarts Docker, and recreates the container environment.
|
||||
|
||||
## Purpose
|
||||
|
||||
The purpose of this role is to provide automated healing for Docker Compose environments, minimizing manual recovery effort and reducing downtime.
|
||||
|
||||
## Features
|
||||
|
||||
- **Container Health Monitoring:** Detects unhealthy or exited containers.
|
||||
- **Automated Recovery:** Restarts failed containers and resolves port binding issues.
|
||||
- **Run-once Setup Logic:** Ensures idempotent execution by controlling task flow with internal flags.
|
||||
- **System Role Integration:** Seamlessly integrates with Infinito.Nexus system maintenance logic.
|
||||
|
||||
## Credits 📝
|
||||
|
||||
Developed and maintained by **Kevin Veen-Birkenbach**.
|
||||
Learn more at [www.veen.world](https://www.veen.world)
|
||||
|
||||
Part of the [Infinito.Nexus Project](https://s.infinito.nexus/code)
|
||||
License: [Infinito.Nexus NonCommercial License](https://s.infinito.nexus/license)
|
5
roles/sys-ctl-rpr-docker-soft/handlers/main.yml
Normal file
5
roles/sys-ctl-rpr-docker-soft/handlers/main.yml
Normal file
@@ -0,0 +1,5 @@
|
||||
- name: restart sys-ctl-rpr-docker-soft service
|
||||
systemd:
|
||||
name: sys-ctl-rpr-docker-soft{{ SYS_SERVICE_SUFFIX }}
|
||||
state: restarted
|
||||
daemon_reload: yes
|
23
roles/sys-ctl-rpr-docker-soft/meta/main.yml
Normal file
23
roles/sys-ctl-rpr-docker-soft/meta/main.yml
Normal file
@@ -0,0 +1,23 @@
|
||||
galaxy_info:
|
||||
author: "Kevin Veen-Birkenbach"
|
||||
description: "Automated recovery for unhealthy or exited Docker Compose containers."
|
||||
license: "Infinito.Nexus NonCommercial License"
|
||||
license_url: "https://s.infinito.nexus/license"
|
||||
company: |
|
||||
Kevin Veen-Birkenbach
|
||||
Consulting & Coaching Solutions
|
||||
https://www.veen.world
|
||||
min_ansible_version: "2.9"
|
||||
platforms:
|
||||
- name: Archlinux
|
||||
versions:
|
||||
- rolling
|
||||
galaxy_tags:
|
||||
- docker
|
||||
- docker-compose
|
||||
- systemd
|
||||
- automation
|
||||
- archlinux
|
||||
repository: https://s.infinito.nexus/code
|
||||
issue_tracker_url: https://s.infinito.nexus/issues
|
||||
documentation: "https://docs.infinito.nexus/"
|
32
roles/sys-ctl-rpr-docker-soft/tasks/01_core.yml
Normal file
32
roles/sys-ctl-rpr-docker-soft/tasks/01_core.yml
Normal file
@@ -0,0 +1,32 @@
|
||||
- name: Include dependency 'sys-lock'
|
||||
include_role:
|
||||
name: sys-lock
|
||||
when: run_once_sys_lock is not defined
|
||||
|
||||
- name: "create {{heal_docker}}"
|
||||
file:
|
||||
path: "{{heal_docker}}"
|
||||
state: directory
|
||||
mode: "0755"
|
||||
|
||||
- name: create sys-ctl-rpr-docker-soft.py
|
||||
template:
|
||||
src: sys-ctl-rpr-docker-soft.py.j2
|
||||
dest: "{{heal_docker}}sys-ctl-rpr-docker-soft.py"
|
||||
notify: restart sys-ctl-rpr-docker-soft service
|
||||
|
||||
- name: create sys-ctl-rpr-docker-soft{{ SYS_SERVICE_SUFFIX }}
|
||||
template:
|
||||
src: sys-ctl-rpr-docker-soft.service.j2
|
||||
dest: /etc/systemd/system/sys-ctl-rpr-docker-soft{{ SYS_SERVICE_SUFFIX }}
|
||||
notify: restart sys-ctl-rpr-docker-soft service
|
||||
|
||||
- name: "set 'service_name' to '{{ role_name }}'"
|
||||
set_fact:
|
||||
service_name: "{{ role_name }}"
|
||||
|
||||
- name: "include role for sys-timer for {{ service_name }}"
|
||||
include_role:
|
||||
name: sys-timer
|
||||
vars:
|
||||
on_calendar: "{{SYS_SCHEDULE_REPAIR_DOCKER_SOFT}}"
|
4
roles/sys-ctl-rpr-docker-soft/tasks/main.yml
Normal file
4
roles/sys-ctl-rpr-docker-soft/tasks/main.yml
Normal file
@@ -0,0 +1,4 @@
|
||||
- block:
|
||||
- include_tasks: 01_core.yml
|
||||
- include_tasks: utils/run_once.yml
|
||||
when: run_once_sys_ctl_rpr_docker_soft is not defined
|
@@ -0,0 +1,89 @@
|
||||
#!/bin/python
|
||||
#
|
||||
# Restart Docker-Compose configurations with exited or unhealthy containers
|
||||
#
|
||||
import subprocess
|
||||
import time
|
||||
import os
|
||||
import argparse
|
||||
|
||||
def bash(command):
    """Run *command* through the shell and return its stdout as a list of lines.

    The command string is echoed before execution so the service log shows
    what was run.

    Args:
        command: Shell command line to execute.

    Returns:
        The captured standard output, split into lines and decoded as UTF-8
        (undecodable bytes are replaced rather than raising).

    Raises:
        Exception: If the command finishes with any non-zero status. The
            message is the stripped stderr text, which callers inspect to
            recognise specific failures.
    """
    print(command)
    completed = subprocess.run(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    stderr = completed.stderr.decode("utf-8", errors="replace").strip()
    # A process killed by a signal reports a NEGATIVE return code, so compare
    # against zero instead of testing for "> 0".
    if completed.returncode != 0:
        print(command, completed.stdout, completed.stderr)
        raise Exception(stderr)  # pass the actual error text
    return [
        line.decode("utf-8", errors="replace")
        for line in completed.stdout.splitlines()
    ]
|
||||
|
||||
def list_to_string(lst):
    """Join the strings in *lst* into one string separated by single spaces."""
    separator = ' '
    return separator.join(lst)
|
||||
|
||||
def print_bash(command):
    """Run *command* via ``bash`` and print its output lines on a single line.

    Returns the list of output lines exactly as ``bash`` produced it.
    """
    lines = bash(command)
    joined = list_to_string(lines)
    print(joined)
    return lines
|
||||
|
||||
def find_docker_compose_file(directory):
    """Locate the first ``docker-compose.yml`` at or below *directory*.

    The tree is traversed with ``os.walk``; the path of the first match is
    returned, or ``None`` when no such file exists in the tree.
    """
    target = 'docker-compose.yml'
    matches = (
        os.path.join(current, target)
        for current, _subdirs, names in os.walk(directory)
        if target in names
    )
    # The generator is lazy, so the walk stops at the first hit.
    return next(matches, None)
|
||||
|
||||
def _is_service_active(unit):
    """Return True when ``systemctl is-active`` reports *unit* as active."""
    try:
        bash(f"systemctl is-active --quiet {unit}")
    except Exception:
        # bash() raises on a non-zero status, which is-active uses for
        # every state other than "active".
        return False
    return True


def main(base_directory):
    """Restart compose projects that own unhealthy or exited containers.

    Steps:
      1. Wait until none of the blocking services is active, polling every
         ``waiting_time`` seconds.
      2. Collect the names of unhealthy and exited containers.
      3. Take the text before the first '-' of each container name as the
         compose project / directory name below *base_directory*.
      4. Restart each project; if the restart fails because a port is
         already allocated, bring the project down, restart the Docker
         daemon and bring the project up again.

    Args:
        base_directory: Directory containing one sub-directory per compose
            project.

    Raises:
        SystemExit: Always; the exit status is the number of projects that
            could not be handled (0 on full success, capped at 255).
    """
    errors = 0
    waiting_time = 600
    blocking_services = (
        "sys-ctl-bkp-docker-2-loc{{ SYS_SERVICE_SUFFIX }}",
        "update-docker{{ SYS_SERVICE_SUFFIX }}",
    )

    # Wait while ANY blocking service is active. Checking them one after
    # another inside a single try block would stop waiting as soon as the
    # first one is inactive, even if the second is still running.
    while True:
        active = [unit for unit in blocking_services if _is_service_active(unit)]
        if not active:
            print("No blocking service is running.")
            break
        print("Blocking service is running:", ", ".join(active))
        print(f"Trying again in {waiting_time} seconds.")
        time.sleep(waiting_time)

    unhealthy_container_names = print_bash("docker ps --filter health=unhealthy --format '{% raw %}{{.Names}}{% endraw %}'")
    exited_container_names = print_bash("docker ps --filter status=exited --format '{% raw %}{{.Names}}{% endraw %}'")
    failed_containers = unhealthy_container_names + exited_container_names

    # dict.fromkeys removes duplicates while keeping first-seen order.
    failed_repositories = list(
        dict.fromkeys(container.split('-')[0] for container in failed_containers)
    )

    for repo in failed_repositories:
        compose_file_path = find_docker_compose_file(os.path.join(base_directory, repo))

        if not compose_file_path:
            print("Error: Docker Compose file not found for:", repo)
            errors += 1
            continue

        print("Restarting unhealthy container in:", compose_file_path)
        project_path = os.path.dirname(compose_file_path)
        try:
            print_bash(f'cd {project_path} && docker-compose -p "{repo}" restart')
        except Exception as e:
            if "port is already allocated" in str(e):
                print("Detected port allocation problem. Executing recovery steps...")
                try:
                    # Use the same project name as restart/up so that "down"
                    # targets the very same compose project.
                    print_bash(f'cd {project_path} && docker-compose -p "{repo}" down')
                    print_bash('systemctl restart docker')
                    print_bash(f'cd {project_path} && docker-compose -p "{repo}" up -d')
                except Exception as recovery_error:
                    # Count the failure and keep going with the remaining
                    # projects instead of aborting the whole run.
                    print("Recovery failed for:", repo, recovery_error)
                    errors += 1
            else:
                print("Unhandled exception during restart:", e)
                errors += 1

    print("Finished restart procedure.")
    # Exit statuses are truncated to 8 bits by the OS; cap the value so a
    # large error count can never wrap around to 0 (success).
    raise SystemExit(min(errors, 255))
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: a single positional argument names the
    # directory that holds the Docker Compose configurations.
    cli = argparse.ArgumentParser(
        description="Restart Docker-Compose configurations with exited or unhealthy containers.",
    )
    cli.add_argument(
        "base_directory",
        type=str,
        help="Base directory where Docker Compose configurations are located.",
    )
    options = cli.parse_args()

    main(options.base_directory)
|
@@ -0,0 +1,8 @@
|
||||
[Unit]
|
||||
Description=restart unhealthy docker containers
|
||||
OnFailure=sys-ctl-alm-compose.{{ SOFTWARE_NAME }}@%n.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStartPre=/bin/sh -c '/usr/bin/python {{ PATH_SYSTEM_LOCK_SCRIPT }} {{ SYS_SERVICE_GROUP_MANIPULATION | join(' ') }} --ignore {{SYS_SERVICE_GROUP_CLEANUP| join(' ') }} sys-ctl-rpr-docker-soft --timeout "{{SYS_TIMEOUT_HEAL_DOCKER}}"'
|
||||
ExecStart=/bin/sh -c '/bin/python {{heal_docker}}sys-ctl-rpr-docker-soft.py {{ PATH_DOCKER_COMPOSE_INSTANCES }}'
|
2
roles/sys-ctl-rpr-docker-soft/vars/main.yml
Normal file
2
roles/sys-ctl-rpr-docker-soft/vars/main.yml
Normal file
@@ -0,0 +1,2 @@
|
||||
heal_docker: '{{ PATH_ADMINISTRATOR_SCRIPTS }}sys-ctl-rpr-docker-soft/'
|
||||
|
Reference in New Issue
Block a user