feat(filters): enforce safe Node.js heap sizing via reusable filter

- Add node_autosize filter (node_max_old_space_size) using get_app_conf - Raise error when mem_limit < min_mb to prevent OOM-kill misconfigurations - Wire Whiteboard NODE_OPTIONS and increase mem_limit to 1g; set cpus=1 - Refactor PeerTube to use the same filter; simplify vars - Add unit tests; keep integration filters usage green Context: https://chatgpt.com/share/690e0499-6a94-800f-b8ed-2c5124690103
2025-11-10 15:16:31 +00:00 · 2025-11-07 15:39:54 +01:00
parent 493d5bbbda
commit 36f9573fdf
6 changed files with 231 additions and 31 deletions
--- a/filter_plugins/node_autosize.py
+++ b/filter_plugins/node_autosize.py
@@ -0,0 +1,141 @@
+# filter_plugins/node_autosize.py
+# Reuse app config to derive sensible Node.js heap sizes for containers.
+#
+# Usage example (Jinja):
+#   {{ applications | node_max_old_space_size('web-app-nextcloud', 'whiteboard') }}
+#
+# Heuristics (defaults):
+#   - candidate = 35% of mem_limit
+#   - min       = 768 MB (required minimum)
+#   - cap       = min(3072 MB, 60% of mem_limit)
+#
+# NEW: If mem_limit (container cgroup RAM) is smaller than min_mb, we raise an
+# exception — to prevent a misconfiguration where Node's heap could exceed the cgroup
+# and be OOM-killed.
+
+from __future__ import annotations
+import re
+from ansible.errors import AnsibleFilterError
+
+# Import the shared config resolver from module_utils
+try:
+    from module_utils.config_utils import get_app_conf, AppConfigKeyError
+except Exception as e:
+    raise AnsibleFilterError(
+        f"Failed to import get_app_conf from module_utils.config_utils: {e}"
+    )
+
+_SIZE_RE = re.compile(r"^\s*(\d+(?:\.\d+)?)\s*([kmgtp]?i?b?)?\s*$", re.IGNORECASE)
+_MULT = {
+    "": 1,
+    "b": 1,
+    "k": 10**3, "kb": 10**3,
+    "m": 10**6, "mb": 10**6,
+    "g": 10**9, "gb": 10**9,
+    "t": 10**12, "tb": 10**12,
+    "p": 10**15, "pb": 10**15,
+    "kib": 1024,
+    "mib": 1024**2,
+    "gib": 1024**3,
+    "tib": 1024**4,
+    "pib": 1024**5,
+}
+
+
+def _to_bytes(val):
+    """Convert numeric or string memory limits (e.g. '512m', '2GiB') to bytes."""
+    if val is None or val == "":
+        return None
+    if isinstance(val, (int, float)):
+        return int(val)
+    if not isinstance(val, str):
+        raise AnsibleFilterError(f"Unsupported mem_limit type: {type(val).__name__}")
+    m = _SIZE_RE.match(val)
+    if not m:
+        raise AnsibleFilterError(f"Unrecognized mem_limit string: {val!r}")
+    num = float(m.group(1))
+    unit = (m.group(2) or "").lower()
+    if unit not in _MULT:
+        raise AnsibleFilterError(f"Unknown unit in mem_limit: {unit!r}")
+    return int(num * _MULT[unit])
+
+
+def _mb(bytes_val: int) -> int:
+    """Return decimal MB (10^6) as integer — Node expects MB units."""
+    return int(round(bytes_val / 10**6))
+
+
+def _compute_old_space_mb(
+    total_mb: int, pct: float, min_mb: int, hardcap_mb: int, safety_cap_pct: float
+) -> int:
+    """
+    Compute Node.js old-space heap (MB) with safe minimum and cap handling.
+
+    NOTE: The calling function ensures total_mb >= min_mb; here we only
+    apply the sizing heuristics and caps.
+    """
+    candidate = int(total_mb * float(pct))
+    safety_cap = int(total_mb * float(safety_cap_pct))
+    final_cap = min(int(hardcap_mb), safety_cap)
+
+    # Enforce minimum first; only apply cap if it's above the minimum
+    candidate = max(candidate, int(min_mb))
+    if final_cap >= int(min_mb):
+        candidate = min(candidate, final_cap)
+
+    # Never below a tiny hard floor
+    return max(candidate, 128)
+
+
+def node_max_old_space_size(
+    applications: dict,
+    application_id: str,
+    service_name: str,
+    pct: float = 0.35,
+    min_mb: int = 768,
+    hardcap_mb: int = 3072,
+    safety_cap_pct: float = 0.60,
+) -> int:
+    """
+    Derive Node.js --max-old-space-size (MB) from the service's mem_limit in app config.
+
+    Looks up: docker.services.<service_name>.mem_limit for the given application_id.
+
+    Raises:
+        AnsibleFilterError if mem_limit is missing/invalid OR if mem_limit (MB) < min_mb.
+    """
+    try:
+        mem_limit = get_app_conf(
+            applications=applications,
+            application_id=application_id,
+            config_path=f"docker.services.{service_name}.mem_limit",
+            strict=True,
+            default=None,
+        )
+    except AppConfigKeyError as e:
+        raise AnsibleFilterError(str(e))
+
+    if mem_limit in (None, False, ""):
+        raise AnsibleFilterError(
+            f"mem_limit not set for application '{application_id}', service '{service_name}'"
+        )
+
+    total_bytes = _to_bytes(mem_limit)
+    total_mb = _mb(total_bytes)
+
+    # NEW: guardrail — refuse to size a heap larger than the cgroup limit
+    if total_mb < int(min_mb):
+        raise AnsibleFilterError(
+            f"mem_limit ({total_mb} MB) is below the required minimum heap ({int(min_mb)} MB) "
+            f"for application '{application_id}', service '{service_name}'. "
+            f"Increase mem_limit or lower min_mb."
+        )
+
+    return _compute_old_space_mb(total_mb, pct, min_mb, hardcap_mb, safety_cap_pct)
+
+
+class FilterModule(object):
+    def filters(self):
+        return {
+            "node_max_old_space_size": node_max_old_space_size,
+        }
--- a/roles/web-app-nextcloud/config/main.yml
+++ b/roles/web-app-nextcloud/config/main.yml
@@ -93,9 +93,9 @@ docker:
      version:              "latest"
      backup:
        no_stop_required:   true
-      cpus:                 "0.25"
+      cpus:                 "1"
      mem_reservation:      "128m"
-      mem_limit:            "512m"
+      mem_limit:            "1g"
      pids_limit:           1024
  enabled:  "{{ applications | get_app_conf('web-app-nextcloud', 'features.oidc', False, True, True) }}"   # Activate OIDC for Nextcloud
  # floavor decides which OICD plugin should be used. 
--- a/roles/web-app-nextcloud/templates/docker-compose.yml.j2
+++ b/roles/web-app-nextcloud/templates/docker-compose.yml.j2
@@ -77,7 +77,8 @@
    volumes:
      - whiteboard_tmp:/tmp
      - whiteboard_fontcache:/var/cache/fontconfig
-
+    environment:
+      - NODE_OPTIONS=--max-old-space-size={{ NEXTCLOUD_WHITEBOARD_MAX_OLD_SPACE_SIZE }}
    expose:
      - "{{ container_port }}"
    shm_size: 1g
--- a/roles/web-app-nextcloud/vars/main.yml
+++ b/roles/web-app-nextcloud/vars/main.yml
@@ -130,6 +130,7 @@ NEXTCLOUD_WHITEBOARD_TMP_VOLUME:          "{{ applications | get_app_conf(applic
 NEXTCLOUD_WHITEBOARD_FRONTCACHE_VOLUME:   "{{ applications | get_app_conf(application_id, 'docker.volumes.whiteboard_fontcache') }}"
 NEXTCLOUD_WHITEBOARD_SERVICE_DIRECTORY:   "{{ [ docker_compose.directories.services, 'whiteboard' ] | path_join }}"
 NEXTCLOUD_WHITEBOARD_SERVICE_DOCKERFILE:  "{{ [ NEXTCLOUD_WHITEBOARD_SERVICE_DIRECTORY, 'Dockerfile' ] | path_join }}"
+NEXTCLOUD_WHITEBOARD_MAX_OLD_SPACE_SIZE:  "{{ applications | node_max_old_space_size(application_id, NEXTCLOUD_WHITEBOARD_SERVICE) }}"

 ### Collabora
 NEXTCLOUD_COLLABORA_URL:                  "{{ domains | get_url('web-svc-collabora', WEB_PROTOCOL) }}"
--- a/roles/web-app-peertube/vars/main.yml
+++ b/roles/web-app-peertube/vars/main.yml
@@ -1,6 +1,7 @@
 # General
 application_id:                   "web-app-peertube"
 database_type:                    "postgres"
+entity_name:                      "{{ application_id | get_entity_name }}"

 # Docker
 docker_compose_flush_handlers:    true
@@ -16,32 +17,8 @@ PEERTUBE_CONFIG_VOLUME:           "{{ applications | get_app_conf(application_id
 PEERTUBE_OIDC_PLUGIN:             "peertube-plugin-auth-openid-connect"
 PEERTUBE_OIDC_ENABLED:            "{{ applications | get_app_conf(application_id, 'features.oidc') }}"

-# === Dynamic performance defaults ==========================================
-
-# Raw Docker configuration values (with sane fallbacks)
-peertube_cpus:          "{{ applications | get_app_conf(application_id, 'docker.services.peertube.cpus') | float }}"
-peertube_mem_limit_raw: "{{ applications | get_app_conf(application_id, 'docker.services.peertube.mem_limit') }}"
-peertube_mem_bytes:     "{{ peertube_mem_limit_raw | human_to_bytes }}"
-peertube_mem_mb:        "{{ ((peertube_mem_bytes | int) // (1024 * 1024)) | int }}"
-
-# ---------------------------------------------------------------------------
-# Node heap size:
-# ~35% of total RAM, but at least 768 MB, at most 3072 MB,
-# and never more than 60% of total memory (safety cap for small containers)
-# ---------------------------------------------------------------------------
-
-_peertube_heap_candidate_mb: "{{ ((peertube_mem_mb | float) * 0.35) | round(0, 'floor') | int }}"
-_peertube_heap_cap_mb:       "{{ ((peertube_mem_mb | float) * 0.60) | round(0, 'floor') | int }}"
-
-# Step 1: enforce minimum (≥768 MB)
-_peertube_heap_min_applied:  "{{ [ (_peertube_heap_candidate_mb | int), 768 ] | max }}"
-
-# Step 2: determine hard cap (min of 3072 MB and 60% of total memory)
-_peertube_heap_hardcap:      "{{ [ 3072, (_peertube_heap_cap_mb | int) ] | min }}"
-
-# Step 3: final heap = min(min-applied, hardcap)
-PEERTUBE_MAX_OLD_SPACE_SIZE: "{{ [ (_peertube_heap_min_applied | int), (_peertube_heap_hardcap | int) ] | min }}"
-
-# Transcoding concurrency: half the vCPUs; min 1, max 8
-_peertube_concurrency_candidate:  "{{ ((peertube_cpus | float) * 0.5) | round(0, 'floor') | int }}"
+# Performance
+PEERTUBE_CPUS:                    "{{ applications | get_app_conf(application_id, 'docker.services.peertube.cpus') | float }}"
+PEERTUBE_MAX_OLD_SPACE_SIZE:      "{{ applications | node_max_old_space_size(application_id, entity_name) }}"
+_peertube_concurrency_candidate:  "{{ ((PEERTUBE_CPUS | float) * 0.5) | round(0, 'floor') | int }}"
 PEERTUBE_TRANSCODING_CONCURRENCY: "{{ [ ( [ (_peertube_concurrency_candidate | int), 1 ] | max ), 8 ] | min }}"
--- a/tests/unit/filter_plugins/test_node_autosize.py
+++ b/tests/unit/filter_plugins/test_node_autosize.py
@@ -0,0 +1,80 @@
+# tests/unit/filter_plugins/test_node_autosize.py
+import unittest
+from unittest.mock import patch
+
+# Module under test
+import filter_plugins.node_autosize as na
+
+try:
+    from ansible.errors import AnsibleFilterError  # type: ignore
+except Exception:
+    AnsibleFilterError = Exception
+
+
+class TestNodeAutosizeFilter(unittest.TestCase):
+    """Unit tests for the node_autosize filter plugin."""
+
+    def setUp(self):
+        # Default parameters used by all tests
+        self.applications = {"web-app-nextcloud": {"docker": {"services": {"whiteboard": {}}}}}
+        self.application_id = "web-app-nextcloud"
+        self.service_name = "whiteboard"
+
+        # Patch get_app_conf (imported from module_utils.config_utils) inside the filter plugin
+        self.patcher = patch("filter_plugins.node_autosize.get_app_conf")
+        self.mock_get_app_conf = self.patcher.start()
+
+    def tearDown(self):
+        self.patcher.stop()
+
+    def _set_mem_limit(self, value):
+        """Helper: mock get_app_conf to return a specific mem_limit value."""
+        def _fake_get_app_conf(applications, application_id, config_path, strict=True, default=None, **_kwargs):
+            assert application_id == self.application_id
+            assert config_path == f"docker.services.{self.service_name}.mem_limit"
+            return value
+        self.mock_get_app_conf.side_effect = _fake_get_app_conf
+
+    # --- Tests for node_max_old_space_size (MB) ---
+
+    def test_512m_below_minimum_raises(self):
+        # mem_limit=512 MB < min_mb=768 -> must raise
+        self._set_mem_limit("512m")
+        with self.assertRaises(AnsibleFilterError):
+            na.node_max_old_space_size(self.applications, self.application_id, self.service_name)
+
+    def test_2g_caps_to_minimum_768(self):
+        self._set_mem_limit("2g")
+        mb = na.node_max_old_space_size(self.applications, self.application_id, self.service_name)
+        self.assertEqual(mb, 768)  # 35% of 2g = 700 < 768 -> min wins
+
+    def test_8g_uses_35_percent_without_hitting_hardcap(self):
+        self._set_mem_limit("8g")
+        mb = na.node_max_old_space_size(self.applications, self.application_id, self.service_name)
+        self.assertEqual(mb, 2800)  # 8g -> 8000 MB * 0.35 = 2800
+
+    def test_16g_hits_hardcap_3072(self):
+        self._set_mem_limit("16g")
+        mb = na.node_max_old_space_size(self.applications, self.application_id, self.service_name)
+        self.assertEqual(mb, 3072)  # 35% of 16g = 5600, hardcap=3072
+
+    def test_numeric_bytes_input(self):
+        # 2 GiB in bytes (IEC): 2 * 1024 ** 3 = 2147483648
+        self._set_mem_limit(2147483648)
+        mb = na.node_max_old_space_size(self.applications, self.application_id, self.service_name)
+        # 2 GiB ≈ 2147 MB; 35% => ~751, min 768 => 768
+        self.assertEqual(mb, 768)
+
+    def test_invalid_unit_raises_error(self):
+        self._set_mem_limit("12x")  # invalid unit
+        with self.assertRaises(AnsibleFilterError):
+            na.node_max_old_space_size(self.applications, self.application_id, self.service_name)
+
+    def test_missing_mem_limit_raises_error(self):
+        self._set_mem_limit(None)
+        with self.assertRaises(AnsibleFilterError):
+            na.node_max_old_space_size(self.applications, self.application_id, self.service_name)
+
+
+if __name__ == "__main__":
+    unittest.main()