mirror of
https://github.com/kevinveenbirkenbach/computer-playbook.git
synced 2025-08-30 15:28:12 +02:00
Refactor systemctl services and categories due to alarm bugs
This commit restructures systemctl service definitions and category mappings. Motivation: Alarm-related bugs revealed inconsistencies in service and role handling. Preparation step: lays the groundwork for fixing the alarm issues by aligning categories, roles, and service templates.
This commit is contained in:
26
roles/sys-ctl-hlth-docker-container/README.md
Normal file
26
roles/sys-ctl-hlth-docker-container/README.md
Normal file
@@ -0,0 +1,26 @@
|
||||
# Docker Container Health Check
|
||||
|
||||
## Description
|
||||
|
||||
This role monitors the health status of Docker containers on the system. It detects containers that are either **unhealthy** or have **exited with a non-zero code**, and triggers alerts if issues are found.
|
||||
|
||||
## Overview
|
||||
|
||||
The role installs a health check script along with a `systemd` service and timer to run these checks at scheduled intervals.
|
||||
If unhealthy or failed containers are detected, the configured failure notifier (via `sys-ctl-alm-compose`) is triggered.
|
||||
|
||||
## Purpose
|
||||
|
||||
The primary purpose of this role is to ensure that Docker-based services remain operational. By automatically monitoring container health, it enables administrators to react quickly to failures, reducing downtime and preventing unnoticed service degradation.
|
||||
|
||||
## Features
|
||||
|
||||
- **Automated Health Checks:** Detects containers that are in the `unhealthy` state or that have exited with a non-zero exit code.
|
||||
- **Systemd Integration:** Installs a one-shot service and timer to run health checks on a schedule.
|
||||
- **Alerting Support:** Works with the [`sys-ctl-alm-compose`](../sys-ctl-alm-compose/README.md) role for failure notifications.
|
||||
- **Configurable Script Location:** Controlled via the `PATH_ADMINISTRATOR_SCRIPTS` variable.
|
||||
|
||||
## Further Resources
|
||||
|
||||
- [Docker Health Checks Documentation](https://docs.docker.com/engine/reference/run/#healthcheck)
|
||||
- [Systemd Timers Documentation](https://www.freedesktop.org/software/systemd/man/systemd.timer.html)
|
@@ -0,0 +1,30 @@
|
||||
#!/bin/sh
# Docker container health check.
#
# Reports containers whose health status is "unhealthy" and containers that
# have exited with a non-zero exit code.
#
# Exit status:
#   0  no unhealthy or failed containers were found
#   1  at least one container is unhealthy
#   2  at least one container exited with a non-zero code (overrides 1)
#   3  the Docker CLI could not be queried

# An empty result must mean "nothing found", so a failing `docker ps`
# (e.g. unreachable daemon) is reported instead of being read as healthy.
if ! unhealthy_names="$(docker ps --filter health=unhealthy --format '{{.Names}}')"; then
  echo "Unable to query unhealthy docker containers." >&2
  exit 3
fi

if ! exited_ids="$(docker ps --filter status=exited --format '{{.ID}}')"; then
  echo "Unable to query exited docker containers." >&2
  exit 3
fi

exitcode=0

if [ -n "$unhealthy_names" ]; then
  echo "Some docker containers are unhealthy: $unhealthy_names"
  exitcode=1
fi

# Container IDs contain no whitespace, so the unquoted expansion is split
# into one ID per iteration on purpose. An empty list skips the loop.
for container_id in $exited_ids; do
  # The container may have been removed since `docker ps` ran; skip it then.
  container_exit_code="$(docker inspect --format '{{.State.ExitCode}}' "$container_id")" || continue
  container_name="$(docker inspect --format '{{.Name}}' "$container_id")" || continue
  container_name="${container_name#/}" # strip the leading '/'

  if [ "$container_exit_code" -ne 0 ]; then
    echo "Container $container_name exited with code $container_exit_code"
    exitcode=2
  fi
done

if [ "$exitcode" -ne 0 ]; then
  exit "$exitcode"
fi

echo "All docker containers are healthy."
exit 0
|
5
roles/sys-ctl-hlth-docker-container/handlers/main.yml
Normal file
5
roles/sys-ctl-hlth-docker-container/handlers/main.yml
Normal file
@@ -0,0 +1,5 @@
|
||||
# Reloads the systemd manager configuration and enables the health check
# service. Notified by the task that templates the service unit file.
- name: "reload sys-ctl-hlth-docker-container service"
  systemd:
    name: "sys-ctl-hlth-docker-container{{ SYS_SERVICE_SUFFIX }}"
    enabled: true
    daemon_reload: true
|
21
roles/sys-ctl-hlth-docker-container/meta/main.yml
Normal file
21
roles/sys-ctl-hlth-docker-container/meta/main.yml
Normal file
@@ -0,0 +1,21 @@
|
||||
# Ansible Galaxy metadata for the Docker container health check role.
galaxy_info:
  author: 'Kevin Veen-Birkenbach'
  description: 'Checks Docker containers for unhealthy or exited states and alerts on any issues.'
  company: |
    Kevin Veen-Birkenbach
    Consulting & Coaching Solutions
    https://www.veen.world
  license: 'Infinito.Nexus NonCommercial License'
  license_url: 'https://s.infinito.nexus/license'
  # Quoted so the version stays a string rather than a float.
  min_ansible_version: '2.9'
  platforms:
    - name: Archlinux
      versions:
        - 'rolling'
  galaxy_tags:
    - monitor
    - docker
    - containers
    - health
    - systemd
  repository: 'https://s.infinito.nexus/code'
  documentation: 'https://docs.infinito.nexus'
|
31
roles/sys-ctl-hlth-docker-container/tasks/01_core.yml
Normal file
31
roles/sys-ctl-hlth-docker-container/tasks/01_core.yml
Normal file
@@ -0,0 +1,31 @@
|
||||
# Installs the Docker container health check: the alarm dependency, the check
# script, its systemd service unit, and the timer that schedules it.

# Skipped when the alarm role's run-once flag is already defined.
- name: Include dependency 'sys-ctl-alm-compose'
  include_role:
    name: sys-ctl-alm-compose
  when: run_once_sys_ctl_alm_compose is not defined

- name: "create {{ health_docker_container_folder }}"
  file:
    path: "{{ health_docker_container_folder }}"
    state: directory
    mode: "0755"

# Explicit modes keep the resulting permissions independent of the umask.
- name: create sys-ctl-hlth-docker-container.sh
  copy:
    src: sys-ctl-hlth-docker-container.sh
    dest: "{{ health_docker_container_folder }}sys-ctl-hlth-docker-container.sh"
    mode: "0755"

- name: "create sys-ctl-hlth-docker-container{{ SYS_SERVICE_SUFFIX }}"
  template:
    src: sys-ctl-hlth-docker-container.service.j2
    dest: "/etc/systemd/system/sys-ctl-hlth-docker-container{{ SYS_SERVICE_SUFFIX }}"
    mode: "0644"
  notify: reload sys-ctl-hlth-docker-container service

# NOTE(review): 'service_name' is presumably read by the sys-timer role
# included below - confirm against that role.
- name: "set 'service_name' to '{{ role_name }}'"
  set_fact:
    service_name: "{{ role_name }}"

- name: "include role for sys-timer for {{ service_name }}"
  include_role:
    name: sys-timer
  vars:
    on_calendar: "{{ SYS_SCHEDULE_HEALTH_DOCKER_CONTAINER }}"
|
4
roles/sys-ctl-hlth-docker-container/tasks/main.yml
Normal file
4
roles/sys-ctl-hlth-docker-container/tasks/main.yml
Normal file
@@ -0,0 +1,4 @@
|
||||
# Entry point of the role. The block is skipped once
# run_once_sys_ctl_hlth_docker_container is defined
# (presumably set by utils/run_once.yml - verify).
- when: run_once_sys_ctl_hlth_docker_container is not defined
  block:
    - include_tasks: 01_core.yml
    - include_tasks: utils/run_once.yml
|
@@ -0,0 +1,7 @@
|
||||
# systemd unit that runs the Docker container health check script.
[Unit]
Description=Checking docker health
# When this unit fails, start the sys-ctl-alm-compose template unit with this
# unit's full name (%n) as its instance.
OnFailure=sys-ctl-alm-compose.{{ SOFTWARE_NAME }}@%n.service

[Service]
# oneshot: the unit counts as started only after the script has exited.
Type=oneshot
# The script is passed to bash explicitly, so its executable bit is not used.
ExecStart=/bin/bash {{health_docker_container_folder}}sys-ctl-hlth-docker-container.sh
|
1
roles/sys-ctl-hlth-docker-container/vars/main.yml
Normal file
1
roles/sys-ctl-hlth-docker-container/vars/main.yml
Normal file
@@ -0,0 +1 @@
|
||||
# Directory that holds the health check script. The trailing slash matters:
# the role's tasks and service template append the file name directly.
health_docker_container_folder: "{{ PATH_ADMINISTRATOR_SCRIPTS }}sys-ctl-hlth-docker-container/"
|
Reference in New Issue
Block a user