From 838286c54ed392211b0118ec6a27bd6807fd4fce Mon Sep 17 00:00:00 2001 From: Kevin Veen-Birkenbach Date: Tue, 6 Jan 2026 17:23:05 +0100 Subject: [PATCH] feat: make cleanup production-safe by separating invalid backups from infra errors - Delete only truly invalid backups (dirval rc=1) - Treat timeouts and missing dirval as infrastructure errors - Never auto-delete backups affected by timeouts - Return exit code 1 on infrastructure problems - Update unit and E2E tests to reflect new safety semantics - Align README with new deletion and exit-code behavior https://chatgpt.com/share/695d36f6-0000-800f-98e7-f88a798d6e91 --- README.md | 105 +++++++++++++++++++++++------------ src/cleanback/__main__.py | 54 +++++++++++++++--- tests/e2e/test_e2e_docker.py | 18 +++--- tests/unit/test_main.py | 14 ++--- 4 files changed, 131 insertions(+), 60 deletions(-) diff --git a/README.md b/README.md index 5b2a3a8..82d9ba3 100644 --- a/README.md +++ b/README.md @@ -7,21 +7,24 @@ **Repository:** https://github.com/kevinveenbirkenbach/cleanup-failed-backups -`cleanback` validates and (optionally) cleans up **failed Docker backup directories**. -It scans backup folders under a configurable backups root (e.g. `/Backups`), uses `dirval` to validate each subdirectory, and lets you delete the ones that fail validation. +`cleanback` validates and (optionally) cleans up **failed Docker backup directories** in a **production-safe** way. -Validation runs **in parallel** for performance; deletions are controlled and can be **interactive** or **fully automatic**. +It scans backup folders under a configurable backups root (for example `/Backups`), uses `dirval` to validate each backup subdirectory, and removes **only those backups that are confirmed to be invalid**. + +Validation runs **in parallel** for performance; deletions are **explicitly controlled** and can be interactive or fully automated. --- ## โœจ Highlights - **Parallel validation** of backup subdirectories -- Uses **`dirval`** (directory-validator) via CLI -- **Interactive** or **non-interactive** deletion flow (`--yes`) +- Uses **`dirval`** (directory validator) via CLI +- **Safe deletion model**: only truly invalid backups are removed +- **Interactive** or **non-interactive** cleanup (`--yes`) - Supports validating a single backup **ID** or **all** backups +- Clear **exit code semantics** for CI and system services - Clean **Python package** with `pyproject.toml` -- **Unit + Docker-based E2E tests** +- **Unit tests** and **Docker-based E2E tests** --- @@ -31,7 +34,7 @@ Validation runs **in parallel** for performance; deletions are controlled and ca ```bash pip install cleanback -```` +``` This installs: @@ -51,7 +54,7 @@ pip install -e . ## ๐Ÿ”ง Requirements * Python **3.8+** -* Access to the backups root directory tree (e.g. `/Backups`) +* Read/write access to the backups root directory tree (e.g. `/Backups`) * `dirval` (installed automatically via pip dependency) --- @@ -66,6 +69,8 @@ After installation, the command is: cleanback ``` +--- + ### Validate a single backup ID ```bash @@ -78,6 +83,8 @@ Validates directories under: /Backups//backup-docker-to-local/* ``` +--- + ### Validate all backups ```bash @@ -92,50 +99,78 @@ Scans: --- -### Common options +## โš™๏ธ Common options -| Option | Description | -| -------------------- | ------------------------------------------------------------------ | -| `--dirval-cmd ` | Path or name of `dirval` executable (default: `dirval`) | -| `--workers ` | Parallel workers (default: CPU count, min 2) | -| `--timeout ` | Per-directory validation timeout (float supported, default: 300.0) | -| `--yes` | Non-interactive mode: delete failures automatically | -| `--force-keep ` | In `--all` mode: skip the last *n* backup folders (default: 0) | +| Option | Description | +| -------------------- | ------------------------------------------------------------------------------------- | +| `--dirval-cmd ` | Path or name of `dirval` executable (default: `dirval`) | +| `--workers ` | Number of parallel validator workers (default: CPU count, minimum 2) | +| `--timeout ` | Per-directory validation timeout in seconds (float supported, default: `300.0`) | +| `--yes` | Non-interactive mode: automatically delete **invalid** backups (dirval rc=1 only) | +| `--force-keep ` | In `--all` mode: skip the last *n* timestamp subdirectories inside each backup folder | + +> **Note:** Backups affected by timeouts or infrastructure errors are **never deleted automatically**, even when `--yes` is used. --- -### Examples +## ๐Ÿงช Examples ```bash -# Validate a single backup and prompt on failures +# Validate a single backup and prompt before deleting invalid ones cleanback --backups-root /Backups --id 2024-09-01T12-00-00 - -# Validate everything with 8 workers and auto-delete failures -cleanback --backups-root /Backups --all --workers 8 --yes - -# Use a custom dirval binary and short timeout -cleanback --backups-root /Backups --all --dirval-cmd /usr/local/bin/dirval --timeout 5.0 ``` ---- - -## ๐Ÿงช Tests - -### Run all tests +```bash +# Validate all backups and automatically delete invalid ones +cleanback --backups-root /Backups --all --workers 8 --yes +``` ```bash -make test +# Use a custom dirval binary and a short timeout (testing only) +cleanback \ + --backups-root /Backups \ + --all \ + --dirval-cmd /usr/local/bin/dirval \ + --timeout 5.0 ``` --- ## ๐Ÿ”’ Safety & Design Notes -* **No host filesystem is modified** during tests - (E2E tests run exclusively inside Docker) -* Deletions are **explicitly confirmed** unless `--yes` is used -* Timeouts are treated as **validation failures** -* Validation and deletion phases are **clearly separated** +* **Validation and deletion are strictly separated** +* Only backups explicitly marked **invalid by `dirval`** are eligible for deletion +* **Timeouts and infrastructure errors are NOT treated as invalid backups** +* Backups affected by timeouts are **never deleted automatically** +* Infrastructure problems (timeouts, missing `dirval`) cause a **non-zero exit code** +* Deletions require confirmation unless `--yes` is specified +* Tests never touch the host filesystem (E2E tests run inside Docker only) + +This design makes `cleanback` safe for unattended operation on production systems. + +--- + +## ๐Ÿšฆ Exit codes + +`cleanback` uses exit codes to clearly distinguish between backup issues and infrastructure problems: + +| Exit code | Meaning | +| --------- | ------------------------------------------------------------------ | +| `0` | All backups valid, or invalid backups were successfully removed | +| `1` | Validation infrastructure problem (e.g. timeout, missing `dirval`) | +| `2` | CLI usage or configuration error | + +This makes the tool suitable for **CI pipelines**, **systemd services**, and other automation. + +--- + +## ๐Ÿงช Tests + +Run all tests (unit + Docker-based E2E): + +```bash +make test +``` --- diff --git a/src/cleanback/__main__.py b/src/cleanback/__main__.py index c2d1b88..b4085f7 100755 --- a/src/cleanback/__main__.py +++ b/src/cleanback/__main__.py @@ -277,6 +277,19 @@ def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: return parser.parse_args(argv) +def _is_timeout(res: ValidationResult) -> bool: + return res.returncode == 124 or "timed out" in (res.stderr or "").lower() + + +def _is_dirval_missing(res: ValidationResult) -> bool: + return res.returncode == 127 or "not found" in (res.stderr or "").lower() + + +def _is_invalid(res: ValidationResult) -> bool: + # dirval: 0 = ok, 1 = invalid, others = infra errors (timeout/missing/etc.) + return res.returncode == 1 + + def main(argv: Optional[List[str]] = None) -> int: args = parse_args(argv) @@ -296,18 +309,43 @@ def main(argv: Optional[List[str]] = None) -> int: return 0 results = parallel_validate(subdirs, args.dirval_cmd, args.workers, args.timeout) - failures = [r for r in results if not r.ok] - if not failures: + invalids = [r for r in results if _is_invalid(r)] + timeouts = [r for r in results if _is_timeout(r)] + missing = [r for r in results if _is_dirval_missing(r)] + + deleted = 0 + if invalids: + print(f"\n{len(invalids)} directory(ies) are invalid (dirval rc=1).") + deleted = process_deletions(invalids, assume_yes=args.yes) + + ok_count = sum(1 for r in results if r.ok) + + if timeouts or missing: + print("\nERROR: validation infrastructure problem detected.") + if timeouts: + print(f"- timeouts: {len(timeouts)} (will NOT delete these)") + for r in timeouts[:10]: + print(f" timeout: {r.subdir}") + if len(timeouts) > 10: + print(f" ... (+{len(timeouts) - 10} more)") + if missing: + print(f"- dirval missing: {len(missing)} (will NOT delete these)") + for r in missing[:10]: + print(f" missing: {r.subdir}") + if len(missing) > 10: + print(f" ... (+{len(missing) - 10} more)") + + print( + f"\nSummary: deleted={deleted}, invalid={len(invalids)}, ok={ok_count}, timeouts={len(timeouts)}, missing={len(missing)}" + ) + return 1 + + if not invalids: print("\nAll directories validated successfully. No action required.") return 0 - print(f"\n{len(failures)} directory(ies) failed validation.") - deleted = process_deletions(failures, assume_yes=args.yes) - kept = len(failures) - deleted - print( - f"\nSummary: deleted={deleted}, kept={kept}, ok={len(results) - len(failures)}" - ) + print(f"\nSummary: deleted={deleted}, invalid={len(invalids)}, ok={ok_count}") return 0 diff --git a/tests/e2e/test_e2e_docker.py b/tests/e2e/test_e2e_docker.py index 6d99343..c6a06a3 100644 --- a/tests/e2e/test_e2e_docker.py +++ b/tests/e2e/test_e2e_docker.py @@ -121,13 +121,6 @@ class CleanbackE2EDockerTests(unittest.TestCase): env["PATH"] = f"{self.bin_dir}:{env.get('PATH', '')}" # Run: python -m cleanback --id --yes - # We must point BACKUPS_ROOT to our run_root. Easiest: set /Backups = run_root - # But code currently has BACKUPS_ROOT = /Backups constant. - # - # Therefore, we create our test tree under /Backups (done above) and pass --id - # relative to that structure by using run_root/. To do that, we make - # run_root the direct child under /Backups, then we pass the composite id: - # "/". composite_id = f"{self.run_root.name}/{self.backup_id}" cmd = [ @@ -148,14 +141,19 @@ class CleanbackE2EDockerTests(unittest.TestCase): ] proc = subprocess.run(cmd, text=True, capture_output=True, env=env) - self.assertEqual(proc.returncode, 0, msg=proc.stderr or proc.stdout) + # New behavior: + # - invalid dirs are deleted and do NOT cause failure + # - timeouts are treated as infrastructure problems -> exit code 1 and NOT deleted + self.assertEqual(proc.returncode, 1, msg=proc.stderr or proc.stdout) + self.assertTrue(self.good.exists(), "good should remain") self.assertFalse(self.bad.exists(), "bad should be deleted") - self.assertFalse( + self.assertTrue( self.timeout.exists(), - "timeout should be deleted (timeout treated as failure)", + "timeout should NOT be deleted (timeouts are infrastructure problems)", ) self.assertIn("Summary:", proc.stdout) + self.assertIn("validation infrastructure problem", proc.stdout.lower()) if __name__ == "__main__": diff --git a/tests/unit/test_main.py b/tests/unit/test_main.py index ceaf9c4..464c014 100644 --- a/tests/unit/test_main.py +++ b/tests/unit/test_main.py @@ -123,12 +123,12 @@ class CleanupBackupsUsingDirvalTests(unittest.TestCase): "--yes", ] ) - self.assertEqual(rc, 0, msg=err or out) + self.assertEqual(rc, 1, msg=err or out) self.assertTrue(self.goodA.exists(), "goodA should remain") self.assertFalse(self.badB.exists(), "badB should be deleted") - self.assertFalse( + self.assertTrue( self.timeoutC.exists(), - "timeoutC should be deleted (timeout treated as failure)", + "timeoutC should NOT be deleted (timeout is infra error)", ) self.assertIn("Summary:", out) @@ -147,10 +147,10 @@ class CleanupBackupsUsingDirvalTests(unittest.TestCase): "--yes", ] ) - self.assertEqual(rc, 0, msg=err or out) + self.assertEqual(rc, 1, msg=err or out) self.assertTrue(self.goodA.exists()) self.assertFalse(self.badB.exists()) - self.assertFalse(self.timeoutC.exists()) + self.assertTrue(self.timeoutC.exists()) self.assertTrue(self.goodX.exists()) self.assertFalse(self.badY.exists()) @@ -198,8 +198,8 @@ class CleanupBackupsUsingDirvalTests(unittest.TestCase): "--yes", ] ) - self.assertEqual(rc, 0, msg=err or out) - self.assertIn("dirval not found", out + err) + self.assertEqual(rc, 1, msg=err or out) + self.assertIn("dirval missing", out + err) def test_no_targets_message(self): empty = self.backups_root / "EMPTY" / "backup-docker-to-local"