From 36f9573fdfead47b789ce7c32a83176f7b946855 Mon Sep 17 00:00:00 2001 From: Kevin Veen-Birkenbach Date: Fri, 7 Nov 2025 15:39:54 +0100 Subject: [PATCH] feat(filters): enforce safe Node.js heap sizing via reusable filter - Add node_autosize filter (node_max_old_space_size) using get_app_conf - Raise error when mem_limit < min_mb to prevent OOM-kill misconfigurations - Wire Whiteboard NODE_OPTIONS and increase mem_limit to 1g; set cpus=1 - Refactor PeerTube to use the same filter; simplify vars - Add unit tests; keep integration filters usage green Context: https://chatgpt.com/share/690e0499-6a94-800f-b8ed-2c5124690103 --- filter_plugins/node_autosize.py | 141 ++++++++++++++++++ roles/web-app-nextcloud/config/main.yml | 4 +- .../templates/docker-compose.yml.j2 | 3 +- roles/web-app-nextcloud/vars/main.yml | 1 + roles/web-app-peertube/vars/main.yml | 33 +--- .../unit/filter_plugins/test_node_autosize.py | 80 ++++++++++ 6 files changed, 231 insertions(+), 31 deletions(-) create mode 100644 filter_plugins/node_autosize.py create mode 100644 tests/unit/filter_plugins/test_node_autosize.py diff --git a/filter_plugins/node_autosize.py b/filter_plugins/node_autosize.py new file mode 100644 index 00000000..424385e2 --- /dev/null +++ b/filter_plugins/node_autosize.py @@ -0,0 +1,141 @@ +# filter_plugins/node_autosize.py +# Reuse app config to derive sensible Node.js heap sizes for containers. +# +# Usage example (Jinja): +# {{ applications | node_max_old_space_size('web-app-nextcloud', 'whiteboard') }} +# +# Heuristics (defaults): +# - candidate = 35% of mem_limit +# - min = 768 MB (required minimum) +# - cap = min(3072 MB, 60% of mem_limit) +# +# NEW: If mem_limit (container cgroup RAM) is smaller than min_mb, we raise an +# exception — to prevent a misconfiguration where Node's heap could exceed the cgroup +# and be OOM-killed. + +from __future__ import annotations +import re +from ansible.errors import AnsibleFilterError + +# Import the shared config resolver from module_utils +try: + from module_utils.config_utils import get_app_conf, AppConfigKeyError +except Exception as e: + raise AnsibleFilterError( + f"Failed to import get_app_conf from module_utils.config_utils: {e}" + ) + +_SIZE_RE = re.compile(r"^\s*(\d+(?:\.\d+)?)\s*([kmgtp]?i?b?)?\s*$", re.IGNORECASE) +_MULT = { + "": 1, + "b": 1, + "k": 10**3, "kb": 10**3, + "m": 10**6, "mb": 10**6, + "g": 10**9, "gb": 10**9, + "t": 10**12, "tb": 10**12, + "p": 10**15, "pb": 10**15, + "kib": 1024, + "mib": 1024**2, + "gib": 1024**3, + "tib": 1024**4, + "pib": 1024**5, +} + + +def _to_bytes(val): + """Convert numeric or string memory limits (e.g. '512m', '2GiB') to bytes.""" + if val is None or val == "": + return None + if isinstance(val, (int, float)): + return int(val) + if not isinstance(val, str): + raise AnsibleFilterError(f"Unsupported mem_limit type: {type(val).__name__}") + m = _SIZE_RE.match(val) + if not m: + raise AnsibleFilterError(f"Unrecognized mem_limit string: {val!r}") + num = float(m.group(1)) + unit = (m.group(2) or "").lower() + if unit not in _MULT: + raise AnsibleFilterError(f"Unknown unit in mem_limit: {unit!r}") + return int(num * _MULT[unit]) + + +def _mb(bytes_val: int) -> int: + """Return decimal MB (10^6) as integer — Node expects MB units.""" + return int(round(bytes_val / 10**6)) + + +def _compute_old_space_mb( + total_mb: int, pct: float, min_mb: int, hardcap_mb: int, safety_cap_pct: float +) -> int: + """ + Compute Node.js old-space heap (MB) with safe minimum and cap handling. + + NOTE: The calling function ensures total_mb >= min_mb; here we only + apply the sizing heuristics and caps. + """ + candidate = int(total_mb * float(pct)) + safety_cap = int(total_mb * float(safety_cap_pct)) + final_cap = min(int(hardcap_mb), safety_cap) + + # Enforce minimum first; only apply cap if it's above the minimum + candidate = max(candidate, int(min_mb)) + if final_cap >= int(min_mb): + candidate = min(candidate, final_cap) + + # Never below a tiny hard floor + return max(candidate, 128) + + +def node_max_old_space_size( + applications: dict, + application_id: str, + service_name: str, + pct: float = 0.35, + min_mb: int = 768, + hardcap_mb: int = 3072, + safety_cap_pct: float = 0.60, +) -> int: + """ + Derive Node.js --max-old-space-size (MB) from the service's mem_limit in app config. + + Looks up: docker.services..mem_limit for the given application_id. + + Raises: + AnsibleFilterError if mem_limit is missing/invalid OR if mem_limit (MB) < min_mb. + """ + try: + mem_limit = get_app_conf( + applications=applications, + application_id=application_id, + config_path=f"docker.services.{service_name}.mem_limit", + strict=True, + default=None, + ) + except AppConfigKeyError as e: + raise AnsibleFilterError(str(e)) + + if mem_limit in (None, False, ""): + raise AnsibleFilterError( + f"mem_limit not set for application '{application_id}', service '{service_name}'" + ) + + total_bytes = _to_bytes(mem_limit) + total_mb = _mb(total_bytes) + + # NEW: guardrail — refuse to size a heap larger than the cgroup limit + if total_mb < int(min_mb): + raise AnsibleFilterError( + f"mem_limit ({total_mb} MB) is below the required minimum heap ({int(min_mb)} MB) " + f"for application '{application_id}', service '{service_name}'. " + f"Increase mem_limit or lower min_mb." + ) + + return _compute_old_space_mb(total_mb, pct, min_mb, hardcap_mb, safety_cap_pct) + + +class FilterModule(object): + def filters(self): + return { + "node_max_old_space_size": node_max_old_space_size, + } diff --git a/roles/web-app-nextcloud/config/main.yml b/roles/web-app-nextcloud/config/main.yml index b2998fcf..9cf1d2f4 100644 --- a/roles/web-app-nextcloud/config/main.yml +++ b/roles/web-app-nextcloud/config/main.yml @@ -93,9 +93,9 @@ docker: version: "latest" backup: no_stop_required: true - cpus: "0.25" + cpus: "1" mem_reservation: "128m" - mem_limit: "512m" + mem_limit: "1g" pids_limit: 1024 enabled: "{{ applications | get_app_conf('web-app-nextcloud', 'features.oidc', False, True, True) }}" # Activate OIDC for Nextcloud # floavor decides which OICD plugin should be used. diff --git a/roles/web-app-nextcloud/templates/docker-compose.yml.j2 b/roles/web-app-nextcloud/templates/docker-compose.yml.j2 index 18065d72..ebbe0e30 100644 --- a/roles/web-app-nextcloud/templates/docker-compose.yml.j2 +++ b/roles/web-app-nextcloud/templates/docker-compose.yml.j2 @@ -77,7 +77,8 @@ volumes: - whiteboard_tmp:/tmp - whiteboard_fontcache:/var/cache/fontconfig - + environment: + - NODE_OPTIONS=--max-old-space-size={{ NEXTCLOUD_WHITEBOARD_MAX_OLD_SPACE_SIZE }} expose: - "{{ container_port }}" shm_size: 1g diff --git a/roles/web-app-nextcloud/vars/main.yml b/roles/web-app-nextcloud/vars/main.yml index 58c5a928..aad29fac 100644 --- a/roles/web-app-nextcloud/vars/main.yml +++ b/roles/web-app-nextcloud/vars/main.yml @@ -130,6 +130,7 @@ NEXTCLOUD_WHITEBOARD_TMP_VOLUME: "{{ applications | get_app_conf(applic NEXTCLOUD_WHITEBOARD_FRONTCACHE_VOLUME: "{{ applications | get_app_conf(application_id, 'docker.volumes.whiteboard_fontcache') }}" NEXTCLOUD_WHITEBOARD_SERVICE_DIRECTORY: "{{ [ docker_compose.directories.services, 'whiteboard' ] | path_join }}" NEXTCLOUD_WHITEBOARD_SERVICE_DOCKERFILE: "{{ [ NEXTCLOUD_WHITEBOARD_SERVICE_DIRECTORY, 'Dockerfile' ] | path_join }}" +NEXTCLOUD_WHITEBOARD_MAX_OLD_SPACE_SIZE: "{{ applications | node_max_old_space_size(application_id, NEXTCLOUD_WHITEBOARD_SERVICE) }}" ### Collabora NEXTCLOUD_COLLABORA_URL: "{{ domains | get_url('web-svc-collabora', WEB_PROTOCOL) }}" diff --git a/roles/web-app-peertube/vars/main.yml b/roles/web-app-peertube/vars/main.yml index d56e5c40..e0270699 100644 --- a/roles/web-app-peertube/vars/main.yml +++ b/roles/web-app-peertube/vars/main.yml @@ -1,6 +1,7 @@ # General application_id: "web-app-peertube" database_type: "postgres" +entity_name: "{{ application_id | get_entity_name }}" # Docker docker_compose_flush_handlers: true @@ -16,32 +17,8 @@ PEERTUBE_CONFIG_VOLUME: "{{ applications | get_app_conf(application_id PEERTUBE_OIDC_PLUGIN: "peertube-plugin-auth-openid-connect" PEERTUBE_OIDC_ENABLED: "{{ applications | get_app_conf(application_id, 'features.oidc') }}" -# === Dynamic performance defaults ========================================== - -# Raw Docker configuration values (with sane fallbacks) -peertube_cpus: "{{ applications | get_app_conf(application_id, 'docker.services.peertube.cpus') | float }}" -peertube_mem_limit_raw: "{{ applications | get_app_conf(application_id, 'docker.services.peertube.mem_limit') }}" -peertube_mem_bytes: "{{ peertube_mem_limit_raw | human_to_bytes }}" -peertube_mem_mb: "{{ ((peertube_mem_bytes | int) // (1024 * 1024)) | int }}" - -# --------------------------------------------------------------------------- -# Node heap size: -# ~35% of total RAM, but at least 768 MB, at most 3072 MB, -# and never more than 60% of total memory (safety cap for small containers) -# --------------------------------------------------------------------------- - -_peertube_heap_candidate_mb: "{{ ((peertube_mem_mb | float) * 0.35) | round(0, 'floor') | int }}" -_peertube_heap_cap_mb: "{{ ((peertube_mem_mb | float) * 0.60) | round(0, 'floor') | int }}" - -# Step 1: enforce minimum (≥768 MB) -_peertube_heap_min_applied: "{{ [ (_peertube_heap_candidate_mb | int), 768 ] | max }}" - -# Step 2: determine hard cap (min of 3072 MB and 60% of total memory) -_peertube_heap_hardcap: "{{ [ 3072, (_peertube_heap_cap_mb | int) ] | min }}" - -# Step 3: final heap = min(min-applied, hardcap) -PEERTUBE_MAX_OLD_SPACE_SIZE: "{{ [ (_peertube_heap_min_applied | int), (_peertube_heap_hardcap | int) ] | min }}" - -# Transcoding concurrency: half the vCPUs; min 1, max 8 -_peertube_concurrency_candidate: "{{ ((peertube_cpus | float) * 0.5) | round(0, 'floor') | int }}" +# Performance +PEERTUBE_CPUS: "{{ applications | get_app_conf(application_id, 'docker.services.peertube.cpus') | float }}" +PEERTUBE_MAX_OLD_SPACE_SIZE: "{{ applications | node_max_old_space_size(application_id, entity_name) }}" +_peertube_concurrency_candidate: "{{ ((PEERTUBE_CPUS | float) * 0.5) | round(0, 'floor') | int }}" PEERTUBE_TRANSCODING_CONCURRENCY: "{{ [ ( [ (_peertube_concurrency_candidate | int), 1 ] | max ), 8 ] | min }}" diff --git a/tests/unit/filter_plugins/test_node_autosize.py b/tests/unit/filter_plugins/test_node_autosize.py new file mode 100644 index 00000000..dca9bd27 --- /dev/null +++ b/tests/unit/filter_plugins/test_node_autosize.py @@ -0,0 +1,80 @@ +# tests/unit/filter_plugins/test_node_autosize.py +import unittest +from unittest.mock import patch + +# Module under test +import filter_plugins.node_autosize as na + +try: + from ansible.errors import AnsibleFilterError # type: ignore +except Exception: + AnsibleFilterError = Exception + + +class TestNodeAutosizeFilter(unittest.TestCase): + """Unit tests for the node_autosize filter plugin.""" + + def setUp(self): + # Default parameters used by all tests + self.applications = {"web-app-nextcloud": {"docker": {"services": {"whiteboard": {}}}}} + self.application_id = "web-app-nextcloud" + self.service_name = "whiteboard" + + # Patch get_app_conf (imported from module_utils.config_utils) inside the filter plugin + self.patcher = patch("filter_plugins.node_autosize.get_app_conf") + self.mock_get_app_conf = self.patcher.start() + + def tearDown(self): + self.patcher.stop() + + def _set_mem_limit(self, value): + """Helper: mock get_app_conf to return a specific mem_limit value.""" + def _fake_get_app_conf(applications, application_id, config_path, strict=True, default=None, **_kwargs): + assert application_id == self.application_id + assert config_path == f"docker.services.{self.service_name}.mem_limit" + return value + self.mock_get_app_conf.side_effect = _fake_get_app_conf + + # --- Tests for node_max_old_space_size (MB) --- + + def test_512m_below_minimum_raises(self): + # mem_limit=512 MB < min_mb=768 -> must raise + self._set_mem_limit("512m") + with self.assertRaises(AnsibleFilterError): + na.node_max_old_space_size(self.applications, self.application_id, self.service_name) + + def test_2g_caps_to_minimum_768(self): + self._set_mem_limit("2g") + mb = na.node_max_old_space_size(self.applications, self.application_id, self.service_name) + self.assertEqual(mb, 768) # 35% of 2g = 700 < 768 -> min wins + + def test_8g_uses_35_percent_without_hitting_hardcap(self): + self._set_mem_limit("8g") + mb = na.node_max_old_space_size(self.applications, self.application_id, self.service_name) + self.assertEqual(mb, 2800) # 8g -> 8000 MB * 0.35 = 2800 + + def test_16g_hits_hardcap_3072(self): + self._set_mem_limit("16g") + mb = na.node_max_old_space_size(self.applications, self.application_id, self.service_name) + self.assertEqual(mb, 3072) # 35% of 16g = 5600, hardcap=3072 + + def test_numeric_bytes_input(self): + # 2 GiB in bytes (IEC): 2 * 1024 ** 3 = 2147483648 + self._set_mem_limit(2147483648) + mb = na.node_max_old_space_size(self.applications, self.application_id, self.service_name) + # 2 GiB ≈ 2147 MB; 35% => ~751, min 768 => 768 + self.assertEqual(mb, 768) + + def test_invalid_unit_raises_error(self): + self._set_mem_limit("12x") # invalid unit + with self.assertRaises(AnsibleFilterError): + na.node_max_old_space_size(self.applications, self.application_id, self.service_name) + + def test_missing_mem_limit_raises_error(self): + self._set_mem_limit(None) + with self.assertRaises(AnsibleFilterError): + na.node_max_old_space_size(self.applications, self.application_id, self.service_name) + + +if __name__ == "__main__": + unittest.main()