feat(filters): enforce safe Node.js heap sizing via reusable filter

- Add node_autosize filter (node_max_old_space_size) using get_app_conf
- Raise error when mem_limit < min_mb to prevent OOM-kill misconfigurations
- Wire Whiteboard NODE_OPTIONS and increase mem_limit to 1g; set cpus=1
- Refactor PeerTube to use the same filter; simplify vars
- Add unit tests; keep integration filters usage green

Context: https://chatgpt.com/share/690e0499-6a94-800f-b8ed-2c5124690103
This commit is contained in:
2025-11-07 15:39:54 +01:00
parent 493d5bbbda
commit 36f9573fdf
6 changed files with 231 additions and 31 deletions

View File

@@ -0,0 +1,141 @@
# filter_plugins/node_autosize.py
# Reuse app config to derive sensible Node.js heap sizes for containers.
#
# Usage example (Jinja):
# {{ applications | node_max_old_space_size('web-app-nextcloud', 'whiteboard') }}
#
# Heuristics (defaults):
# - candidate = 35% of mem_limit
# - min = 768 MB (required minimum)
# - cap = min(3072 MB, 60% of mem_limit); the cap is skipped when it is below min
#
# NEW: If mem_limit (container cgroup RAM) is smaller than min_mb, we raise an
# exception — to prevent a misconfiguration where Node's heap could exceed the cgroup
# and be OOM-killed.
from __future__ import annotations
import re
from ansible.errors import AnsibleFilterError
# Import the shared config resolver from module_utils
try:
from module_utils.config_utils import get_app_conf, AppConfigKeyError
except Exception as e:
raise AnsibleFilterError(
f"Failed to import get_app_conf from module_utils.config_utils: {e}"
)
# Matches "<number><optional unit>" case-insensitively, allowing whitespace
# around and between the two parts. Group 1 is the (possibly fractional)
# number, group 2 the unit. The unit pattern is intentionally loose; the
# parser validates the captured unit against _MULT.
_SIZE_RE = re.compile(r"^\s*(\d+(?:\.\d+)?)\s*([kmgtp]?i?b?)?\s*$", re.IGNORECASE)

# Byte multiplier for each supported unit suffix (keys are lowercase).
# NOTE(review): the short suffixes ("k", "m", "g", ... and "kb", "mb", ...)
# are decimal here, while Docker reportedly reads mem_limit suffixes as binary
# multiples. The decimal reading gives a slightly smaller total; confirm this
# conservative interpretation is intended.
_MULT = {
    # No suffix or plain bytes
    "": 1,
    "b": 1,
    # Decimal units (powers of 1000)
    "k": 10**3,
    "kb": 10**3,
    "m": 10**6,
    "mb": 10**6,
    "g": 10**9,
    "gb": 10**9,
    "t": 10**12,
    "tb": 10**12,
    "p": 10**15,
    "pb": 10**15,
    # Binary units (powers of 1024)
    "kib": 1024,
    "mib": 1024**2,
    "gib": 1024**3,
    "tib": 1024**4,
    "pib": 1024**5,
}
def _to_bytes(val):
    """Convert a memory limit to a whole number of bytes.

    Accepted inputs:
      - ``None`` or ``""``: returned as ``None`` (nothing to convert).
      - a non-negative, finite ``int``/``float``: taken as a byte count and
        truncated to ``int``.
      - a string such as ``'512m'`` or ``'2GiB'``: a number followed by an
        optional unit, matched by ``_SIZE_RE`` and scaled via ``_MULT``
        (unit lookup is case-insensitive).

    Raises:
        AnsibleFilterError: for booleans and other unsupported types, for
            negative or non-finite numbers, for strings that do not match
            ``_SIZE_RE``, and for units that are not keys of ``_MULT``.
    """
    if val is None or val == "":
        return None

    # bool is a subclass of int; without this guard True would silently be
    # accepted as a 1-byte limit.
    if isinstance(val, bool):
        raise AnsibleFilterError(f"Unsupported mem_limit type: {type(val).__name__}")

    if isinstance(val, (int, float)):
        try:
            byte_count = int(val)
        except (ValueError, OverflowError) as exc:
            # int() raises ValueError for NaN and OverflowError for +/-inf;
            # surface both as a filter error instead of a raw traceback.
            raise AnsibleFilterError(f"Non-finite mem_limit value: {val!r}") from exc
        # Compare the original value so that e.g. -0.5 (which truncates to 0)
        # is still rejected.
        if val < 0:
            raise AnsibleFilterError(f"mem_limit must not be negative: {val!r}")
        return byte_count

    if not isinstance(val, str):
        raise AnsibleFilterError(f"Unsupported mem_limit type: {type(val).__name__}")

    m = _SIZE_RE.match(val)
    if not m:
        raise AnsibleFilterError(f"Unrecognized mem_limit string: {val!r}")

    num = float(m.group(1))
    unit = (m.group(2) or "").lower()
    # _SIZE_RE is looser than _MULT (it lets through e.g. "ki"), so the unit
    # still has to be validated here.
    if unit not in _MULT:
        raise AnsibleFilterError(f"Unknown unit in mem_limit: {unit!r}")
    return int(num * _MULT[unit])
def _mb(bytes_val: int) -> int:
    """Convert a byte count to whole decimal megabytes (1 MB = 10**6 bytes).

    The quotient is rounded with the built-in ``round``, so exact halves go
    to the nearest even integer.
    """
    megabytes = bytes_val / 1_000_000
    return int(round(megabytes))
def _compute_old_space_mb(
    total_mb: int, pct: float, min_mb: int, hardcap_mb: int, safety_cap_pct: float
) -> int:
    """
    Apply the heap sizing heuristics and return the old-space size in MB.

    Steps:
      1. Start from ``pct`` of ``total_mb`` (truncated to int).
      2. Raise that to at least ``min_mb``.
      3. Limit it to ``min(hardcap_mb, safety_cap_pct * total_mb)``, but only
         when that limit is not below ``min_mb``; otherwise the minimum wins
         and the limit is skipped.
      4. Never return less than 128.

    This function does not check that ``total_mb >= min_mb``;
    node_max_old_space_size rejects that case before calling it.
    """
    share_mb = int(total_mb * float(pct))
    ceiling_mb = min(int(total_mb * float(safety_cap_pct)), int(hardcap_mb))
    floor_mb = int(min_mb)

    # The minimum takes precedence over the percentage share.
    heap_mb = share_mb if share_mb > floor_mb else floor_mb

    # A ceiling below the minimum would undo the previous step, so skip it.
    if ceiling_mb >= floor_mb:
        heap_mb = min(heap_mb, ceiling_mb)

    # Absolute lower bound, independent of the configured minimum.
    return heap_mb if heap_mb > 128 else 128
def node_max_old_space_size(
    applications: dict,
    application_id: str,
    service_name: str,
    pct: float = 0.35,
    min_mb: int = 768,
    hardcap_mb: int = 3072,
    safety_cap_pct: float = 0.60,
) -> int:
    """
    Derive Node.js --max-old-space-size (MB) from the service's mem_limit in app config.

    Looks up ``docker.services.<service_name>.mem_limit`` for ``application_id``
    via ``get_app_conf`` (strict mode), converts it to decimal MB and applies
    the sizing heuristics of ``_compute_old_space_mb``.

    Args:
        applications: Application config mapping passed through to ``get_app_conf``.
        application_id: Key of the application whose service is sized.
        service_name: Service name below ``docker.services``.
        pct: Fraction of mem_limit used as the initial heap candidate.
        min_mb: Required minimum heap in MB; mem_limit must be at least this large.
        hardcap_mb: Absolute upper bound for the heap in MB.
        safety_cap_pct: Upper bound for the heap as a fraction of mem_limit.

    Returns:
        The heap size in MB as an int.

    Raises:
        AnsibleFilterError: if the config lookup fails, if mem_limit is
            missing/empty or cannot be parsed, or if mem_limit (MB) < min_mb.
    """
    try:
        mem_limit = get_app_conf(
            applications=applications,
            application_id=application_id,
            config_path=f"docker.services.{service_name}.mem_limit",
            strict=True,
            default=None,
        )
    except AppConfigKeyError as e:
        # Translate to a filter error while keeping the lookup failure as the
        # explicit cause in the traceback.
        raise AnsibleFilterError(str(e)) from e

    # Treat "no value" uniformly. A numeric 0 also lands here because
    # 0 == False in the membership test.
    if mem_limit in (None, False, ""):
        raise AnsibleFilterError(
            f"mem_limit not set for application '{application_id}', service '{service_name}'"
        )

    total_mb = _mb(_to_bytes(mem_limit))
    required_mb = int(min_mb)

    # Guardrail: the sizing step always returns at least min_mb, so a
    # container limit below that minimum cannot hold the resulting heap.
    if total_mb < required_mb:
        raise AnsibleFilterError(
            f"mem_limit ({total_mb} MB) is below the required minimum heap ({required_mb} MB) "
            f"for application '{application_id}', service '{service_name}'. "
            f"Increase mem_limit or lower min_mb."
        )

    return _compute_old_space_mb(total_mb, pct, min_mb, hardcap_mb, safety_cap_pct)
class FilterModule(object):
    """Filter plugin class exposing the Node.js heap sizing filter."""

    def filters(self):
        """Return the mapping from filter name to implementing callable."""
        exported = {}
        exported["node_max_old_space_size"] = node_max_old_space_size
        return exported

View File

@@ -93,9 +93,9 @@ docker:
version: "latest"
backup:
no_stop_required: true
cpus: "0.25"
cpus: "1"
mem_reservation: "128m"
mem_limit: "512m"
mem_limit: "1g"
pids_limit: 1024
enabled: "{{ applications | get_app_conf('web-app-nextcloud', 'features.oidc', False, True, True) }}" # Activate OIDC for Nextcloud
# flavor decides which OIDC plugin should be used.

View File

@@ -77,7 +77,8 @@
volumes:
- whiteboard_tmp:/tmp
- whiteboard_fontcache:/var/cache/fontconfig
environment:
- NODE_OPTIONS=--max-old-space-size={{ NEXTCLOUD_WHITEBOARD_MAX_OLD_SPACE_SIZE }}
expose:
- "{{ container_port }}"
shm_size: 1g

View File

@@ -130,6 +130,7 @@ NEXTCLOUD_WHITEBOARD_TMP_VOLUME: "{{ applications | get_app_conf(applic
NEXTCLOUD_WHITEBOARD_FRONTCACHE_VOLUME: "{{ applications | get_app_conf(application_id, 'docker.volumes.whiteboard_fontcache') }}"
NEXTCLOUD_WHITEBOARD_SERVICE_DIRECTORY: "{{ [ docker_compose.directories.services, 'whiteboard' ] | path_join }}"
NEXTCLOUD_WHITEBOARD_SERVICE_DOCKERFILE: "{{ [ NEXTCLOUD_WHITEBOARD_SERVICE_DIRECTORY, 'Dockerfile' ] | path_join }}"
NEXTCLOUD_WHITEBOARD_MAX_OLD_SPACE_SIZE: "{{ applications | node_max_old_space_size(application_id, NEXTCLOUD_WHITEBOARD_SERVICE) }}"
### Collabora
NEXTCLOUD_COLLABORA_URL: "{{ domains | get_url('web-svc-collabora', WEB_PROTOCOL) }}"

View File

@@ -1,6 +1,7 @@
# General
application_id: "web-app-peertube"
database_type: "postgres"
entity_name: "{{ application_id | get_entity_name }}"
# Docker
docker_compose_flush_handlers: true
@@ -16,32 +17,8 @@ PEERTUBE_CONFIG_VOLUME: "{{ applications | get_app_conf(application_id
PEERTUBE_OIDC_PLUGIN: "peertube-plugin-auth-openid-connect"
PEERTUBE_OIDC_ENABLED: "{{ applications | get_app_conf(application_id, 'features.oidc') }}"
# === Dynamic performance defaults ==========================================
# Raw Docker configuration values (with sane fallbacks)
peertube_cpus: "{{ applications | get_app_conf(application_id, 'docker.services.peertube.cpus') | float }}"
peertube_mem_limit_raw: "{{ applications | get_app_conf(application_id, 'docker.services.peertube.mem_limit') }}"
peertube_mem_bytes: "{{ peertube_mem_limit_raw | human_to_bytes }}"
peertube_mem_mb: "{{ ((peertube_mem_bytes | int) // (1024 * 1024)) | int }}"
# ---------------------------------------------------------------------------
# Node heap size:
# ~35% of total RAM, but at least 768 MB, at most 3072 MB,
# and never more than 60% of total memory (safety cap for small containers)
# ---------------------------------------------------------------------------
_peertube_heap_candidate_mb: "{{ ((peertube_mem_mb | float) * 0.35) | round(0, 'floor') | int }}"
_peertube_heap_cap_mb: "{{ ((peertube_mem_mb | float) * 0.60) | round(0, 'floor') | int }}"
# Step 1: enforce minimum (≥768 MB)
_peertube_heap_min_applied: "{{ [ (_peertube_heap_candidate_mb | int), 768 ] | max }}"
# Step 2: determine hard cap (min of 3072 MB and 60% of total memory)
_peertube_heap_hardcap: "{{ [ 3072, (_peertube_heap_cap_mb | int) ] | min }}"
# Step 3: final heap = min(min-applied, hardcap)
PEERTUBE_MAX_OLD_SPACE_SIZE: "{{ [ (_peertube_heap_min_applied | int), (_peertube_heap_hardcap | int) ] | min }}"
# Transcoding concurrency: half the vCPUs; min 1, max 8
_peertube_concurrency_candidate: "{{ ((peertube_cpus | float) * 0.5) | round(0, 'floor') | int }}"
# Performance
PEERTUBE_CPUS: "{{ applications | get_app_conf(application_id, 'docker.services.peertube.cpus') | float }}"
PEERTUBE_MAX_OLD_SPACE_SIZE: "{{ applications | node_max_old_space_size(application_id, entity_name) }}"
_peertube_concurrency_candidate: "{{ ((PEERTUBE_CPUS | float) * 0.5) | round(0, 'floor') | int }}"
PEERTUBE_TRANSCODING_CONCURRENCY: "{{ [ ( [ (_peertube_concurrency_candidate | int), 1 ] | max ), 8 ] | min }}"

View File

@@ -0,0 +1,80 @@
# tests/unit/filter_plugins/test_node_autosize.py
import unittest
from unittest.mock import patch
# Module under test
import filter_plugins.node_autosize as na
try:
from ansible.errors import AnsibleFilterError # type: ignore
except Exception:
AnsibleFilterError = Exception
class TestNodeAutosizeFilter(unittest.TestCase):
    """Exercise node_max_old_space_size against a stubbed get_app_conf."""

    def setUp(self):
        # Fixture shared by every test: one application with one empty service.
        self.application_id = "web-app-nextcloud"
        self.service_name = "whiteboard"
        self.applications = {"web-app-nextcloud": {"docker": {"services": {"whiteboard": {}}}}}

        # Replace the get_app_conf name as bound inside the filter plugin module.
        self.patcher = patch("filter_plugins.node_autosize.get_app_conf")
        self.mock_get_app_conf = self.patcher.start()

    def tearDown(self):
        self.patcher.stop()

    def _set_mem_limit(self, value):
        """Make the stubbed get_app_conf return ``value`` for the mem_limit lookup."""
        expected_path = f"docker.services.{self.service_name}.mem_limit"

        def _fake_get_app_conf(applications, application_id, config_path, strict=True, default=None, **_kwargs):
            # Verify the filter asks for the right application and config path.
            assert application_id == self.application_id
            assert config_path == expected_path
            return value

        self.mock_get_app_conf.side_effect = _fake_get_app_conf

    def _heap_mb_for(self, mem_limit):
        """Stub ``mem_limit`` and return the filter result for the fixture service."""
        self._set_mem_limit(mem_limit)
        return na.node_max_old_space_size(self.applications, self.application_id, self.service_name)

    def _assert_rejected(self, mem_limit):
        """Stub ``mem_limit`` and expect the filter to raise AnsibleFilterError."""
        self._set_mem_limit(mem_limit)
        with self.assertRaises(AnsibleFilterError):
            na.node_max_old_space_size(self.applications, self.application_id, self.service_name)

    # --- Tests for node_max_old_space_size (MB) ---

    def test_512m_below_minimum_raises(self):
        # 512 MB container limit is below the 768 MB minimum heap -> error.
        self._assert_rejected("512m")

    def test_2g_caps_to_minimum_768(self):
        # 35% of 2000 MB is 700 MB, below the 768 MB minimum -> minimum wins.
        self.assertEqual(self._heap_mb_for("2g"), 768)

    def test_8g_uses_35_percent_without_hitting_hardcap(self):
        # 8g is read as 8000 MB; 35% of that is 2800 MB, under the hard cap.
        self.assertEqual(self._heap_mb_for("8g"), 2800)

    def test_16g_hits_hardcap_3072(self):
        # 35% of 16000 MB is 5600 MB, which the 3072 MB hard cap cuts down.
        self.assertEqual(self._heap_mb_for("16g"), 3072)

    def test_numeric_bytes_input(self):
        # 2 GiB given as a plain byte count (2 * 1024 ** 3 = 2147483648),
        # i.e. about 2147 MB; 35% is about 751 MB, so the 768 MB minimum applies.
        self.assertEqual(self._heap_mb_for(2147483648), 768)

    def test_invalid_unit_raises_error(self):
        # "x" is not a recognised unit suffix.
        self._assert_rejected("12x")

    def test_missing_mem_limit_raises_error(self):
        # A lookup that yields None counts as "mem_limit not set".
        self._assert_rejected(None)
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()