From d9766403121accb269c030570cfc80aa7256b460 Mon Sep 17 00:00:00 2001 From: Kevin Veen-Birkenbach Date: Sat, 10 Jan 2026 18:40:22 +0100 Subject: [PATCH] fix(seed): handle empty databases.csv and add unit tests - Gracefully handle empty databases.csv by creating header columns and emitting a warning - Add _empty_df() helper for consistent DataFrame initialization - Add unit tests for baudolo-seed including empty-file regression case - Apply minor formatting fixes across backup and e2e test files https://chatgpt.com/share/69628f0b-8744-800f-b08d-2633e05167da --- src/baudolo/backup/app.py | 1 + src/baudolo/backup/cli.py | 2 +- src/baudolo/backup/db.py | 8 +- src/baudolo/seed/__main__.py | 32 ++- .../test_e2e_dump_only_fallback_to_files.py | 30 ++- tests/e2e/test_e2e_dump_only_sql_mixed_run.py | 8 +- ...eed_star_and_db_entries_backup_postgres.py | 31 ++- tests/unit/src/baudolo/seed/test_main.py | 216 ++++++++++++++++++ 8 files changed, 289 insertions(+), 39 deletions(-) create mode 100644 tests/unit/src/baudolo/seed/test_main.py diff --git a/src/baudolo/backup/app.py b/src/baudolo/backup/app.py index ed9cd5e..8ffb296 100644 --- a/src/baudolo/backup/app.py +++ b/src/baudolo/backup/app.py @@ -72,6 +72,7 @@ def requires_stop(containers: list[str], images_no_stop_required: list[str]) -> return True return False + def backup_mariadb_or_postgres( *, container: str, diff --git a/src/baudolo/backup/cli.py b/src/baudolo/backup/cli.py index 7615a91..aff13c0 100644 --- a/src/baudolo/backup/cli.py +++ b/src/baudolo/backup/cli.py @@ -68,7 +68,7 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Do not restart containers after backup", ) - + p.add_argument( "--dump-only-sql", action="store_true", diff --git a/src/baudolo/backup/db.py b/src/baudolo/backup/db.py index c5da126..427c5ab 100644 --- a/src/baudolo/backup/db.py +++ b/src/baudolo/backup/db.py @@ -52,7 +52,9 @@ def _atomic_write_cmd(cmd: str, out_file: str) -> None: execute_shell_command(f"mv {tmp} {out_file}") -def fallback_pg_dumpall(container: str, username: str, password: str, out_file: str) -> None: +def fallback_pg_dumpall( + container: str, username: str, password: str, out_file: str +) -> None: """ Perform a full Postgres cluster dump using pg_dumpall. """ @@ -103,9 +105,7 @@ def backup_database( "'*' is currently only supported for Postgres." ) - cluster_file = os.path.join( - out_dir, f"{instance_name}.cluster.backup.sql" - ) + cluster_file = os.path.join(out_dir, f"{instance_name}.cluster.backup.sql") fallback_pg_dumpall(container, user, password, cluster_file) produced = True continue diff --git a/src/baudolo/seed/__main__.py b/src/baudolo/seed/__main__.py index 4ff9d41..69ead02 100644 --- a/src/baudolo/seed/__main__.py +++ b/src/baudolo/seed/__main__.py @@ -7,10 +7,11 @@ import re import sys import pandas as pd from typing import Optional - +from pandas.errors import EmptyDataError DB_NAME_RE = re.compile(r"^[a-zA-Z0-9_][a-zA-Z0-9_-]*$") + def _validate_database_value(value: Optional[str], *, instance: str) -> str: v = (value or "").strip() if v == "": @@ -31,6 +32,11 @@ def _validate_database_value(value: Optional[str], *, instance: str) -> str: ) return v + +def _empty_df() -> pd.DataFrame: + return pd.DataFrame(columns=["instance", "database", "username", "password"]) + + def check_and_add_entry( file_path: str, instance: str, @@ -48,17 +54,21 @@ def check_and_add_entry( database = _validate_database_value(database, instance=instance) if os.path.exists(file_path): - df = pd.read_csv( - file_path, - sep=";", - dtype=str, - keep_default_na=False, - ) + try: + df = pd.read_csv( + file_path, + sep=";", + dtype=str, + keep_default_na=False, + ) + except EmptyDataError: + print( + f"WARNING: databases.csv exists but is empty: {file_path}. Creating header columns.", + file=sys.stderr, + ) + df = _empty_df() else: - df = pd.DataFrame( - columns=["instance", "database", "username", "password"] - ) - + df = _empty_df() mask = (df["instance"] == instance) & (df["database"] == database) if mask.any(): diff --git a/tests/e2e/test_e2e_dump_only_fallback_to_files.py b/tests/e2e/test_e2e_dump_only_fallback_to_files.py index cdb2231..fa458c1 100644 --- a/tests/e2e/test_e2e_dump_only_fallback_to_files.py +++ b/tests/e2e/test_e2e_dump_only_fallback_to_files.py @@ -133,22 +133,28 @@ class TestE2EDumpOnlyFallbackToFiles(unittest.TestCase): ) def test_files_backup_exists_due_to_fallback(self) -> None: - p = backup_path( - self.backups_dir, - self.repo_name, - self.version, - self.pg_volume, - ) / "files" + p = ( + backup_path( + self.backups_dir, + self.repo_name, + self.version, + self.pg_volume, + ) + / "files" + ) self.assertTrue(p.is_dir(), f"Expected files backup dir at: {p}") def test_sql_dump_not_present(self) -> None: # There should be no sql dumps because databases.csv had no matching entry. - sql_dir = backup_path( - self.backups_dir, - self.repo_name, - self.version, - self.pg_volume, - ) / "sql" + sql_dir = ( + backup_path( + self.backups_dir, + self.repo_name, + self.version, + self.pg_volume, + ) + / "sql" + ) # Could exist (dir created) in some edge cases, but should contain no *.sql dumps. if sql_dir.exists(): dumps = list(sql_dir.glob("*.sql")) diff --git a/tests/e2e/test_e2e_dump_only_sql_mixed_run.py b/tests/e2e/test_e2e_dump_only_sql_mixed_run.py index acc7d58..ae94df8 100644 --- a/tests/e2e/test_e2e_dump_only_sql_mixed_run.py +++ b/tests/e2e/test_e2e_dump_only_sql_mixed_run.py @@ -96,10 +96,10 @@ class TestE2EDumpOnlySqlMixedRun(unittest.TestCase): "sh", "-lc", ( - f'psql -U postgres -d {cls.pg_db} -c ' + f"psql -U postgres -d {cls.pg_db} -c " '"CREATE TABLE IF NOT EXISTS t (id INT PRIMARY KEY, v TEXT);' "INSERT INTO t(id,v) VALUES (1,'hello-db') " - "ON CONFLICT (id) DO UPDATE SET v=EXCLUDED.v;\"" + 'ON CONFLICT (id) DO UPDATE SET v=EXCLUDED.v;"' ), ], check=True, @@ -143,7 +143,9 @@ class TestE2EDumpOnlySqlMixedRun(unittest.TestCase): cleanup_docker(containers=cls.containers, volumes=cls.volumes) def test_db_volume_has_dump_and_no_files_dir(self) -> None: - base = backup_path(self.backups_dir, self.repo_name, self.version, self.db_volume) + base = backup_path( + self.backups_dir, self.repo_name, self.version, self.db_volume + ) dumps = base / "sql" files = base / "files" diff --git a/tests/e2e/test_e2e_seed_star_and_db_entries_backup_postgres.py b/tests/e2e/test_e2e_seed_star_and_db_entries_backup_postgres.py index 27d7146..88a11d7 100644 --- a/tests/e2e/test_e2e_seed_star_and_db_entries_backup_postgres.py +++ b/tests/e2e/test_e2e_seed_star_and_db_entries_backup_postgres.py @@ -99,10 +99,10 @@ class TestE2ESeedStarAndDbEntriesBackupPostgres(unittest.TestCase): "sh", "-lc", ( - f'psql -U {cls.pg_user} -d {cls.pg_db1} -c ' + f"psql -U {cls.pg_user} -d {cls.pg_db1} -c " '"CREATE TABLE IF NOT EXISTS t (id INT PRIMARY KEY, v TEXT);' "INSERT INTO t(id,v) VALUES (1,'hello-db1') " - "ON CONFLICT (id) DO UPDATE SET v=EXCLUDED.v;\"" + 'ON CONFLICT (id) DO UPDATE SET v=EXCLUDED.v;"' ), ], check=True, @@ -115,10 +115,10 @@ class TestE2ESeedStarAndDbEntriesBackupPostgres(unittest.TestCase): "sh", "-lc", ( - f'psql -U {cls.pg_user} -d {cls.pg_db2} -c ' + f"psql -U {cls.pg_user} -d {cls.pg_db2} -c " '"CREATE TABLE IF NOT EXISTS t (id INT PRIMARY KEY, v TEXT);' "INSERT INTO t(id,v) VALUES (1,'hello-db2') " - "ON CONFLICT (id) DO UPDATE SET v=EXCLUDED.v;\"" + 'ON CONFLICT (id) DO UPDATE SET v=EXCLUDED.v;"' ), ], check=True, @@ -132,7 +132,16 @@ class TestE2ESeedStarAndDbEntriesBackupPostgres(unittest.TestCase): instance = cls.pg_container # Seed star entry (pg_dumpall) - run(["baudolo-seed", cls.databases_csv, instance, "*", cls.pg_user, cls.pg_password]) + run( + [ + "baudolo-seed", + cls.databases_csv, + instance, + "*", + cls.pg_user, + cls.pg_password, + ] + ) # Seed concrete DB entry (pg_dump) run( @@ -177,7 +186,9 @@ class TestE2ESeedStarAndDbEntriesBackupPostgres(unittest.TestCase): cleanup_docker(containers=cls.containers, volumes=cls.volumes) def test_db_volume_has_cluster_dump_and_concrete_db_dump_and_no_files(self) -> None: - base = backup_path(self.backups_dir, self.repo_name, self.version, self.db_volume) + base = backup_path( + self.backups_dir, self.repo_name, self.version, self.db_volume + ) sql_dir = base / "sql" files_dir = base / "files" @@ -204,10 +215,14 @@ class TestE2ESeedStarAndDbEntriesBackupPostgres(unittest.TestCase): ) def test_non_db_volume_still_has_files_backup(self) -> None: - base = backup_path(self.backups_dir, self.repo_name, self.version, self.files_volume) + base = backup_path( + self.backups_dir, self.repo_name, self.version, self.files_volume + ) files_dir = base / "files" - self.assertTrue(files_dir.exists(), f"Expected files dir for non-DB volume at: {files_dir}") + self.assertTrue( + files_dir.exists(), f"Expected files dir for non-DB volume at: {files_dir}" + ) marker = files_dir / "hello.txt" self.assertTrue(marker.is_file(), f"Expected marker file at: {marker}") diff --git a/tests/unit/src/baudolo/seed/test_main.py b/tests/unit/src/baudolo/seed/test_main.py new file mode 100644 index 0000000..252293a --- /dev/null +++ b/tests/unit/src/baudolo/seed/test_main.py @@ -0,0 +1,216 @@ +# tests/unit/src/baudolo/seed/test_main.py +from __future__ import annotations + +import unittest +from unittest.mock import MagicMock, patch + +import pandas as pd +from pandas.errors import EmptyDataError + +import baudolo.seed.__main__ as seed_main + + +class TestSeedMain(unittest.TestCase): + @patch("baudolo.seed.__main__.pd.DataFrame") + def test_empty_df_creates_expected_columns(self, df_ctor: MagicMock) -> None: + seed_main._empty_df() + df_ctor.assert_called_once_with( + columns=["instance", "database", "username", "password"] + ) + + def test_validate_database_value_rejects_empty(self) -> None: + with self.assertRaises(ValueError): + seed_main._validate_database_value("", instance="x") + + def test_validate_database_value_accepts_star(self) -> None: + self.assertEqual(seed_main._validate_database_value("*", instance="x"), "*") + + def test_validate_database_value_rejects_nan(self) -> None: + with self.assertRaises(ValueError): + seed_main._validate_database_value("nan", instance="x") + + def test_validate_database_value_rejects_invalid_name(self) -> None: + with self.assertRaises(ValueError): + seed_main._validate_database_value("bad name", instance="x") + + @patch("baudolo.seed.__main__.os.path.exists", return_value=False) + @patch("baudolo.seed.__main__.pd.read_csv") + @patch("baudolo.seed.__main__._empty_df") + @patch("baudolo.seed.__main__.pd.concat") + def test_check_and_add_entry_file_missing_adds_entry( + self, + concat: MagicMock, + empty_df: MagicMock, + read_csv: MagicMock, + exists: MagicMock, + ) -> None: + df_existing = MagicMock(spec=pd.DataFrame) + series_mask = MagicMock() + series_mask.any.return_value = False + + df_existing.__getitem__.return_value = series_mask # for df["instance"] etc. + empty_df.return_value = df_existing + + df_out = MagicMock(spec=pd.DataFrame) + concat.return_value = df_out + + seed_main.check_and_add_entry( + file_path="/tmp/databases.csv", + instance="inst", + database="db", + username="user", + password="pass", + ) + + read_csv.assert_not_called() + empty_df.assert_called_once() + concat.assert_called_once() + df_out.to_csv.assert_called_once_with( + "/tmp/databases.csv", sep=";", index=False + ) + + @patch("baudolo.seed.__main__.os.path.exists", return_value=True) + @patch("baudolo.seed.__main__.pd.read_csv", side_effect=EmptyDataError("empty")) + @patch("baudolo.seed.__main__._empty_df") + @patch("baudolo.seed.__main__.pd.concat") + @patch("baudolo.seed.__main__.print") + def test_check_and_add_entry_empty_file_warns_and_creates_columns_and_adds( + self, + print_: MagicMock, + concat: MagicMock, + empty_df: MagicMock, + read_csv: MagicMock, + exists: MagicMock, + ) -> None: + """ + Key regression test: + If file exists but is empty => warn, create header columns, then proceed. + """ + df_existing = MagicMock(spec=pd.DataFrame) + series_mask = MagicMock() + series_mask.any.return_value = False + + # emulate df["instance"] and df["database"] usage + df_existing.__getitem__.return_value = series_mask + empty_df.return_value = df_existing + + df_out = MagicMock(spec=pd.DataFrame) + concat.return_value = df_out + + seed_main.check_and_add_entry( + file_path="/tmp/databases.csv", + instance="inst", + database="db", + username="user", + password="pass", + ) + + exists.assert_called_once_with("/tmp/databases.csv") + read_csv.assert_called_once() + empty_df.assert_called_once() + + # warning was printed to stderr + self.assertTrue(print_.called) + args, kwargs = print_.call_args + self.assertIn("WARNING: databases.csv exists but is empty", args[0]) + self.assertIn("file", kwargs) + self.assertEqual(kwargs["file"], seed_main.sys.stderr) + + concat.assert_called_once() + df_out.to_csv.assert_called_once_with( + "/tmp/databases.csv", sep=";", index=False + ) + + @patch("baudolo.seed.__main__.os.path.exists", return_value=True) + @patch("baudolo.seed.__main__.pd.read_csv") + def test_check_and_add_entry_updates_existing_row( + self, + read_csv: MagicMock, + exists: MagicMock, + ) -> None: + df = MagicMock(spec=pd.DataFrame) + + # mask.any() => True triggers update branch + mask = MagicMock() + mask.any.return_value = True + + # df["instance"] etc => return something that supports comparisons; + # simplest: just return an object that makes mask flow work. + df.__getitem__.return_value = MagicMock() + # Force the computed mask to be our mask + # by making (df["instance"] == instance) & (df["database"] == database) return `mask` + left = MagicMock() + right = MagicMock() + left.__and__.return_value = mask + df.__getitem__.return_value.__eq__.side_effect = [left, right] # two == calls + + read_csv.return_value = df + + seed_main.check_and_add_entry( + file_path="/tmp/databases.csv", + instance="inst", + database="db", + username="user", + password="pass", + ) + + # update branch: df.loc[mask, ["username","password"]] = ... + # we can't easily assert the assignment, but we can assert .loc was accessed + self.assertTrue(hasattr(df, "loc")) + df.to_csv.assert_called_once_with("/tmp/databases.csv", sep=";", index=False) + + @patch("baudolo.seed.__main__.check_and_add_entry") + @patch("baudolo.seed.__main__.argparse.ArgumentParser.parse_args") + def test_main_calls_check_and_add_entry( + self, parse_args: MagicMock, cae: MagicMock + ) -> None: + ns = MagicMock() + ns.file = "/tmp/databases.csv" + ns.instance = "inst" + ns.database = "db" + ns.username = "user" + ns.password = "pass" + parse_args.return_value = ns + + seed_main.main() + + cae.assert_called_once_with( + file_path="/tmp/databases.csv", + instance="inst", + database="db", + username="user", + password="pass", + ) + + @patch("baudolo.seed.__main__.sys.exit") + @patch("baudolo.seed.__main__.print") + @patch( + "baudolo.seed.__main__.check_and_add_entry", side_effect=RuntimeError("boom") + ) + @patch("baudolo.seed.__main__.argparse.ArgumentParser.parse_args") + def test_main_exits_nonzero_on_error( + self, + parse_args: MagicMock, + cae: MagicMock, + print_: MagicMock, + exit_: MagicMock, + ) -> None: + ns = MagicMock() + ns.file = "/tmp/databases.csv" + ns.instance = "inst" + ns.database = "db" + ns.username = "user" + ns.password = "pass" + parse_args.return_value = ns + + seed_main.main() + + # prints error to stderr and exits with 1 + self.assertTrue(print_.called) + _, kwargs = print_.call_args + self.assertEqual(kwargs.get("file"), seed_main.sys.stderr) + exit_.assert_called_once_with(1) + + +if __name__ == "__main__": + unittest.main()