From 4149580d57bc8f037643f2949c924da3ca05d486 Mon Sep 17 00:00:00 2001 From: Varun Deep Saini Date: Mon, 12 Jan 2026 01:30:57 +0530 Subject: [PATCH 01/80] Add WAL for direct deployment state recovery Signed-off-by: Varun Deep Saini --- .../wal/corrupted-wal-entry/databricks.yml | 25 ++ .../wal/corrupted-wal-entry/out.test.toml | 5 + .../deploy/wal/corrupted-wal-entry/output.txt | 56 +++ .../deploy/wal/corrupted-wal-entry/script | 35 ++ .../wal/corrupted-wal-entry/sort_warnings.py | 87 ++++ .../deploy/wal/corrupted-wal-entry/test.py | 1 + .../deploy/wal/corrupted-wal-entry/test.toml | 13 + .../wal/crash-after-create/databricks.yml | 15 + .../wal/crash-after-create/out.test.toml | 5 + .../deploy/wal/crash-after-create/output.txt | 38 ++ .../deploy/wal/crash-after-create/script | 24 + .../deploy/wal/crash-after-create/test.py | 1 + .../deploy/wal/crash-after-create/test.toml | 10 + .../deploy/wal/empty-wal/databricks.yml | 15 + .../bundle/deploy/wal/empty-wal/out.test.toml | 5 + .../bundle/deploy/wal/empty-wal/output.txt | 37 ++ acceptance/bundle/deploy/wal/empty-wal/script | 21 + .../bundle/deploy/wal/empty-wal/test.py | 1 + .../bundle/deploy/wal/empty-wal/test.toml | 13 + .../wal/future-serial-wal/databricks.yml | 15 + .../wal/future-serial-wal/out.test.toml | 5 + .../deploy/wal/future-serial-wal/output.txt | 29 ++ .../deploy/wal/future-serial-wal/script | 28 ++ .../deploy/wal/future-serial-wal/test.py | 1 + .../deploy/wal/future-serial-wal/test.toml | 4 + .../wal/lineage-mismatch/databricks.yml | 15 + .../deploy/wal/lineage-mismatch/out.test.toml | 5 + .../deploy/wal/lineage-mismatch/output.txt | 29 ++ .../bundle/deploy/wal/lineage-mismatch/script | 28 ++ .../deploy/wal/lineage-mismatch/test.py | 1 + .../deploy/wal/lineage-mismatch/test.toml | 4 + .../wal/multiple-crashes/databricks.yml | 15 + .../deploy/wal/multiple-crashes/out.test.toml | 5 + .../deploy/wal/multiple-crashes/output.txt | 64 +++ .../bundle/deploy/wal/multiple-crashes/script | 32 ++ 
.../deploy/wal/multiple-crashes/test.py | 1 + .../deploy/wal/multiple-crashes/test.toml | 10 + .../deploy/wal/normal-deploy/databricks.yml | 15 + .../deploy/wal/normal-deploy/out.test.toml | 5 + .../deploy/wal/normal-deploy/output.txt | 32 ++ .../bundle/deploy/wal/normal-deploy/script | 12 + .../bundle/deploy/wal/normal-deploy/test.py | 1 + .../bundle/deploy/wal/normal-deploy/test.toml | 9 + .../deploy/wal/stale-wal/databricks.yml | 15 + .../bundle/deploy/wal/stale-wal/out.test.toml | 5 + .../bundle/deploy/wal/stale-wal/output.txt | 38 ++ acceptance/bundle/deploy/wal/stale-wal/script | 40 ++ .../bundle/deploy/wal/stale-wal/test.py | 1 + .../bundle/deploy/wal/stale-wal/test.toml | 9 + .../wal/summary-after-crash/databricks.yml | 15 + .../wal/summary-after-crash/out.test.toml | 5 + .../deploy/wal/summary-after-crash/output.txt | 25 ++ .../deploy/wal/summary-after-crash/script | 11 + .../deploy/wal/summary-after-crash/test.py | 1 + .../deploy/wal/summary-after-crash/test.toml | 2 + acceptance/bundle/deploy/wal/test.toml | 43 ++ .../deploy/wal/wal-with-delete/databricks.yml | 15 + .../deploy/wal/wal-with-delete/out.test.toml | 5 + .../deploy/wal/wal-with-delete/output.txt | 21 + .../bundle/deploy/wal/wal-with-delete/script | 48 ++ .../bundle/deploy/wal/wal-with-delete/test.py | 1 + .../deploy/wal/wal-with-delete/test.toml | 5 + bundle/direct/bind.go | 6 +- bundle/direct/bundle_apply.go | 7 +- bundle/direct/bundle_plan.go | 2 +- bundle/direct/dstate/state.go | 121 ++++- bundle/direct/dstate/wal.go | 218 +++++++++ bundle/direct/dstate/wal_test.go | 419 ++++++++++++++++++ cmd/bundle/utils/process.go | 2 +- wal.txt | 205 +++++++++ 70 files changed, 2038 insertions(+), 19 deletions(-) create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-entry/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-entry/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt create mode 100644 
acceptance/bundle/deploy/wal/corrupted-wal-entry/script create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-entry/sort_warnings.py create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-entry/test.py create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml create mode 100644 acceptance/bundle/deploy/wal/crash-after-create/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/crash-after-create/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/crash-after-create/output.txt create mode 100644 acceptance/bundle/deploy/wal/crash-after-create/script create mode 100644 acceptance/bundle/deploy/wal/crash-after-create/test.py create mode 100644 acceptance/bundle/deploy/wal/crash-after-create/test.toml create mode 100644 acceptance/bundle/deploy/wal/empty-wal/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/empty-wal/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/empty-wal/output.txt create mode 100644 acceptance/bundle/deploy/wal/empty-wal/script create mode 100644 acceptance/bundle/deploy/wal/empty-wal/test.py create mode 100644 acceptance/bundle/deploy/wal/empty-wal/test.toml create mode 100644 acceptance/bundle/deploy/wal/future-serial-wal/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/future-serial-wal/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/future-serial-wal/output.txt create mode 100644 acceptance/bundle/deploy/wal/future-serial-wal/script create mode 100644 acceptance/bundle/deploy/wal/future-serial-wal/test.py create mode 100644 acceptance/bundle/deploy/wal/future-serial-wal/test.toml create mode 100644 acceptance/bundle/deploy/wal/lineage-mismatch/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/lineage-mismatch/output.txt create mode 100644 acceptance/bundle/deploy/wal/lineage-mismatch/script create mode 100644 
acceptance/bundle/deploy/wal/lineage-mismatch/test.py create mode 100644 acceptance/bundle/deploy/wal/lineage-mismatch/test.toml create mode 100644 acceptance/bundle/deploy/wal/multiple-crashes/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/multiple-crashes/output.txt create mode 100644 acceptance/bundle/deploy/wal/multiple-crashes/script create mode 100644 acceptance/bundle/deploy/wal/multiple-crashes/test.py create mode 100644 acceptance/bundle/deploy/wal/multiple-crashes/test.toml create mode 100644 acceptance/bundle/deploy/wal/normal-deploy/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/normal-deploy/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/normal-deploy/output.txt create mode 100644 acceptance/bundle/deploy/wal/normal-deploy/script create mode 100644 acceptance/bundle/deploy/wal/normal-deploy/test.py create mode 100644 acceptance/bundle/deploy/wal/normal-deploy/test.toml create mode 100644 acceptance/bundle/deploy/wal/stale-wal/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/stale-wal/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/stale-wal/output.txt create mode 100644 acceptance/bundle/deploy/wal/stale-wal/script create mode 100644 acceptance/bundle/deploy/wal/stale-wal/test.py create mode 100644 acceptance/bundle/deploy/wal/stale-wal/test.toml create mode 100644 acceptance/bundle/deploy/wal/summary-after-crash/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/summary-after-crash/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/summary-after-crash/output.txt create mode 100644 acceptance/bundle/deploy/wal/summary-after-crash/script create mode 100644 acceptance/bundle/deploy/wal/summary-after-crash/test.py create mode 100644 acceptance/bundle/deploy/wal/summary-after-crash/test.toml create mode 100644 acceptance/bundle/deploy/wal/test.toml create mode 100644 
acceptance/bundle/deploy/wal/wal-with-delete/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/wal-with-delete/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/wal-with-delete/output.txt create mode 100644 acceptance/bundle/deploy/wal/wal-with-delete/script create mode 100644 acceptance/bundle/deploy/wal/wal-with-delete/test.py create mode 100644 acceptance/bundle/deploy/wal/wal-with-delete/test.toml create mode 100644 bundle/direct/dstate/wal.go create mode 100644 bundle/direct/dstate/wal_test.go create mode 100644 wal.txt diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/databricks.yml b/acceptance/bundle/deploy/wal/corrupted-wal-entry/databricks.yml new file mode 100644 index 00000000000..cc9024fadab --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/databricks.yml @@ -0,0 +1,25 @@ +bundle: + name: wal-corrupted-test + +resources: + jobs: + valid_job: + name: "valid-job" + tasks: + - task_key: "task-a" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + another_valid: + name: "another-valid" + tasks: + - task_key: "task-b" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/out.test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-entry/out.test.toml new file mode 100644 index 00000000000..54146af5645 --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt new file mode 100644 index 00000000000..11926293327 --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt @@ -0,0 +1,56 @@ +=== Creating 
state file with serial 5 === +=== Creating WAL with corrupted entry === +=== WAL content === +{"lineage":"test-lineage-123","serial": [SERIAL]} +{"k":"resources.jobs.valid_job","v":{"__id__": "[ID]","state":{"name":"valid-job"}}} +not valid json - this line should be skipped +{"k":"resources.jobs.another_valid","v":{"__id__": "[ID]","state":{"name":"another-valid"}}} +=== Deploy (should recover valid entries, skip corrupted) === + +>>> [CLI] bundle deploy +Warning: Single node cluster is not correctly configured + at resources.jobs.another_valid.tasks[0].new_cluster + in databricks.yml:23:13 + +num_workers should be 0 only for single-node clusters. To create a +valid single node cluster please ensure that the following properties +are correctly set in the cluster specification: + + spark_conf: + spark.databricks.cluster.profile: singleNode + spark.master: local[*] + + custom_tags: + ResourceClass: SingleNode + + +Warning: Single node cluster is not correctly configured + at resources.jobs.valid_job.tasks[0].new_cluster + in databricks.yml:13:13 + +num_workers should be 0 only for single-node clusters. To create a +valid single node cluster please ensure that the following properties +are correctly set in the cluster specification: + + spark_conf: + spark.databricks.cluster.profile: singleNode + spark.master: local[*] + + custom_tags: + ResourceClass: SingleNode + + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
+=== Final state (should have recovered entries) === +{ + "serial": [SERIAL], + "state_keys": [ + "resources.jobs.another_valid", + "resources.jobs.valid_job" + ] +} +=== WAL after successful deploy === +WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script new file mode 100644 index 00000000000..d73595a6f4c --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script @@ -0,0 +1,35 @@ +echo "=== Creating state file with serial 5 ===" +mkdir -p .databricks/bundle/default +cat > .databricks/bundle/default/resources.json << 'EOF' +{ + "state_version": 1, + "cli_version": "0.0.0", + "lineage": "test-lineage-123", + "serial": 5, + "state": {} +} +EOF + +echo "=== Creating WAL with corrupted entry ===" +cat > .databricks/bundle/default/resources.json.wal << 'EOF' +{"lineage":"test-lineage-123","serial":6} +{"k":"resources.jobs.valid_job","v":{"__id__":"1111","state":{"name":"valid-job"}}} +not valid json - this line should be skipped +{"k":"resources.jobs.another_valid","v":{"__id__":"2222","state":{"name":"another-valid"}}} +EOF + +echo "=== WAL content ===" +cat .databricks/bundle/default/resources.json.wal + +echo "=== Deploy (should recover valid entries, skip corrupted) ===" +trace $CLI bundle deploy 2>&1 | python3 sort_warnings.py + +echo "=== Final state (should have recovered entries) ===" +cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys | sort)}' + +echo "=== WAL after successful deploy ===" +if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + echo "WAL exists (unexpected)" +else + echo "WAL deleted (expected)" +fi diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/sort_warnings.py b/acceptance/bundle/deploy/wal/corrupted-wal-entry/sort_warnings.py new file mode 100644 index 00000000000..06a6a0e59cc --- /dev/null +++ 
b/acceptance/bundle/deploy/wal/corrupted-wal-entry/sort_warnings.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 +"""Sort warning blocks in CLI output to make test output deterministic. + +Warning blocks look like: +Warning: Single node cluster is not correctly configured + at resources.jobs.XXX.tasks[0].new_cluster + in databricks.yml:NN:NN + +num_workers should be 0 only for single-node clusters... + spark_conf: + ... + custom_tags: + ... + +This script groups consecutive warning blocks, sorts them by job name, and outputs. +""" + +import re +import sys + + +def main(): + content = sys.stdin.read() + lines = content.split("\n") + + result = [] + i = 0 + + while i < len(lines): + line = lines[i] + + # Check if this is the start of a warning block + if line.startswith("Warning:"): + # Collect all consecutive warning blocks + warnings = [] + while i < len(lines) and ( + lines[i].startswith("Warning:") + or ( + warnings + and not lines[i].startswith("Uploading") + and not lines[i].startswith("Deploying") + and not lines[i].startswith(">>>") + and not lines[i].startswith("===") + ) + ): + # Collect one complete warning block + block = [] + if lines[i].startswith("Warning:"): + block.append(lines[i]) + i += 1 + # Collect until next Warning or end marker + while i < len(lines): + if lines[i].startswith("Warning:"): + break + if lines[i].startswith("Uploading") or lines[i].startswith("Deploying"): + break + if lines[i].startswith(">>>") or lines[i].startswith("==="): + break + block.append(lines[i]) + i += 1 + warnings.append(block) + else: + i += 1 + + # Sort warnings by the job name in "at resources.jobs.XXX" + def get_sort_key(block): + for line in block: + match = re.search(r"at resources\.jobs\.(\w+)", line) + if match: + return match.group(1) + return "" + + warnings.sort(key=get_sort_key) + + # Output sorted warnings + for block in warnings: + for line in block: + result.append(line) + else: + result.append(line) + i += 1 + + print("\n".join(result), end="") + + +if 
__name__ == "__main__": + main() diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.py b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.py new file mode 100644 index 00000000000..1ff8e07c707 --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.py @@ -0,0 +1 @@ +print("test") diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml new file mode 100644 index 00000000000..5bbe82835c6 --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml @@ -0,0 +1,13 @@ +# WAL with corrupted entry - valid entries should be recovered, corrupted skipped. + +[[Server]] +Pattern = "POST /api/2.2/jobs/reset" +Response.Body = '{}' + +[[Server]] +Pattern = "GET /api/2.2/jobs/get?job_id=1111" +Response.Body = '{"job_id": 1111, "settings": {"name": "valid-job"}}' + +[[Server]] +Pattern = "GET /api/2.2/jobs/get?job_id=2222" +Response.Body = '{"job_id": 2222, "settings": {"name": "another-valid"}}' diff --git a/acceptance/bundle/deploy/wal/crash-after-create/databricks.yml b/acceptance/bundle/deploy/wal/crash-after-create/databricks.yml new file mode 100644 index 00000000000..ebee1d9699f --- /dev/null +++ b/acceptance/bundle/deploy/wal/crash-after-create/databricks.yml @@ -0,0 +1,15 @@ +bundle: + name: wal-crash-test + +resources: + jobs: + job_a: + name: "test-job-a" + tasks: + - task_key: "task-a" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/crash-after-create/out.test.toml b/acceptance/bundle/deploy/wal/crash-after-create/out.test.toml new file mode 100644 index 00000000000..54146af5645 --- /dev/null +++ b/acceptance/bundle/deploy/wal/crash-after-create/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git 
a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt new file mode 100644 index 00000000000..9c333263829 --- /dev/null +++ b/acceptance/bundle/deploy/wal/crash-after-create/output.txt @@ -0,0 +1,38 @@ +=== Creating state directory === +=== Creating WAL file (simulating crash after job create) === +=== WAL content before deploy === +{"lineage":"test-lineage-123","serial": [SERIAL]} +{"k":"resources.jobs.job_a","v":{"__id__": "[ID]","state":{"name":"test-job-a"}}} +=== Deploy (should recover from WAL) === + +>>> [CLI] bundle deploy +Warning: Single node cluster is not correctly configured + at resources.jobs.job_a.tasks[0].new_cluster + in databricks.yml:13:13 + +num_workers should be 0 only for single-node clusters. To create a +valid single node cluster please ensure that the following properties +are correctly set in the cluster specification: + + spark_conf: + spark.databricks.cluster.profile: singleNode + spark.master: local[*] + + custom_tags: + ResourceClass: SingleNode + + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
+=== State file after recovery === +{ + "lineage": "test-lineage-123", + "serial": [SERIAL], + "state_keys": [ + "resources.jobs.job_a" + ] +} +=== WAL file after successful deploy === +WAL file deleted (expected) diff --git a/acceptance/bundle/deploy/wal/crash-after-create/script b/acceptance/bundle/deploy/wal/crash-after-create/script new file mode 100644 index 00000000000..c583a5eead9 --- /dev/null +++ b/acceptance/bundle/deploy/wal/crash-after-create/script @@ -0,0 +1,24 @@ +echo "=== Creating state directory ===" +mkdir -p .databricks/bundle/default + +echo "=== Creating WAL file (simulating crash after job create) ===" +cat > .databricks/bundle/default/resources.json.wal << 'EOF' +{"lineage":"test-lineage-123","serial":1} +{"k":"resources.jobs.job_a","v":{"__id__":"1001","state":{"name":"test-job-a"}}} +EOF + +echo "=== WAL content before deploy ===" +cat .databricks/bundle/default/resources.json.wal + +echo "=== Deploy (should recover from WAL) ===" +trace $CLI bundle deploy + +echo "=== State file after recovery ===" +cat .databricks/bundle/default/resources.json | jq -S '{lineage: .lineage, serial: .serial, state_keys: (.state | keys)}' + +echo "=== WAL file after successful deploy ===" +if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + echo "WAL file exists (unexpected)" +else + echo "WAL file deleted (expected)" +fi diff --git a/acceptance/bundle/deploy/wal/crash-after-create/test.py b/acceptance/bundle/deploy/wal/crash-after-create/test.py new file mode 100644 index 00000000000..1ff8e07c707 --- /dev/null +++ b/acceptance/bundle/deploy/wal/crash-after-create/test.py @@ -0,0 +1 @@ +print("test") diff --git a/acceptance/bundle/deploy/wal/crash-after-create/test.toml b/acceptance/bundle/deploy/wal/crash-after-create/test.toml new file mode 100644 index 00000000000..9e20bac15dc --- /dev/null +++ b/acceptance/bundle/deploy/wal/crash-after-create/test.toml @@ -0,0 +1,10 @@ +# WAL recovery after simulated crash. 
Job was created but state wasn't finalized. +# Deploy should recover job from WAL and update it. + +[[Server]] +Pattern = "POST /api/2.2/jobs/reset" +Response.Body = '{}' + +[[Server]] +Pattern = "GET /api/2.2/jobs/get" +Response.Body = '{"job_id": 1001, "settings": {"name": "test-job-a"}}' diff --git a/acceptance/bundle/deploy/wal/empty-wal/databricks.yml b/acceptance/bundle/deploy/wal/empty-wal/databricks.yml new file mode 100644 index 00000000000..147a1e1482f --- /dev/null +++ b/acceptance/bundle/deploy/wal/empty-wal/databricks.yml @@ -0,0 +1,15 @@ +bundle: + name: wal-empty-test + +resources: + jobs: + test_job: + name: "test-job" + tasks: + - task_key: "test-task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/empty-wal/out.test.toml b/acceptance/bundle/deploy/wal/empty-wal/out.test.toml new file mode 100644 index 00000000000..54146af5645 --- /dev/null +++ b/acceptance/bundle/deploy/wal/empty-wal/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/empty-wal/output.txt b/acceptance/bundle/deploy/wal/empty-wal/output.txt new file mode 100644 index 00000000000..91a31fe3222 --- /dev/null +++ b/acceptance/bundle/deploy/wal/empty-wal/output.txt @@ -0,0 +1,37 @@ +=== Creating state directory === +=== Creating empty WAL file === +=== Empty WAL file exists === +[FILE_INFO] .databricks/bundle/default/resources.json.wal +=== Deploy (should handle empty WAL gracefully) === + +>>> [CLI] bundle deploy +Warning: Single node cluster is not correctly configured + at resources.jobs.test_job.tasks[0].new_cluster + in databricks.yml:13:13 + +num_workers should be 0 only for single-node clusters. 
To create a +valid single node cluster please ensure that the following properties +are correctly set in the cluster specification: + + spark_conf: + spark.databricks.cluster.profile: singleNode + spark.master: local[*] + + custom_tags: + ResourceClass: SingleNode + + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-empty-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! +=== Checking WAL file after deploy === +Empty WAL deleted (expected) +=== State file content === +{ + "lineage": "[UUID]", + "serial": [SERIAL], + "state_keys": [ + "resources.jobs.test_job" + ] +} diff --git a/acceptance/bundle/deploy/wal/empty-wal/script b/acceptance/bundle/deploy/wal/empty-wal/script new file mode 100644 index 00000000000..f693753ac77 --- /dev/null +++ b/acceptance/bundle/deploy/wal/empty-wal/script @@ -0,0 +1,21 @@ +echo "=== Creating state directory ===" +mkdir -p .databricks/bundle/default + +echo "=== Creating empty WAL file ===" +touch .databricks/bundle/default/resources.json.wal + +echo "=== Empty WAL file exists ===" +ls -la .databricks/bundle/default/resources.json.wal + +echo "=== Deploy (should handle empty WAL gracefully) ===" +trace $CLI bundle deploy + +echo "=== Checking WAL file after deploy ===" +if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + echo "WAL file exists (unexpected)" +else + echo "Empty WAL deleted (expected)" +fi + +echo "=== State file content ===" +cat .databricks/bundle/default/resources.json | jq -S '{lineage: .lineage, serial: .serial, state_keys: (.state | keys)}' diff --git a/acceptance/bundle/deploy/wal/empty-wal/test.py b/acceptance/bundle/deploy/wal/empty-wal/test.py new file mode 100644 index 00000000000..11b15b1a458 --- /dev/null +++ b/acceptance/bundle/deploy/wal/empty-wal/test.py @@ -0,0 +1 @@ +print("hello") diff --git a/acceptance/bundle/deploy/wal/empty-wal/test.toml b/acceptance/bundle/deploy/wal/empty-wal/test.toml new file mode 100644 index 
00000000000..b97264c2bec --- /dev/null +++ b/acceptance/bundle/deploy/wal/empty-wal/test.toml @@ -0,0 +1,13 @@ +# Empty WAL file should be deleted and deploy should proceed normally. + +[[Server]] +Pattern = "POST /api/2.2/jobs/create" +Response.Body = '{"job_id": 1001}' + +[[Server]] +Pattern = "GET /api/2.2/jobs/get" +Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' + +[[Repls]] +Old = '-rw[^ ]+ \d+ [^ ]+ [^ ]+ \d+ [A-Z][a-z]+ \d+ \d+:\d+' +New = '[FILE_INFO]' diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/databricks.yml b/acceptance/bundle/deploy/wal/future-serial-wal/databricks.yml new file mode 100644 index 00000000000..67079aaef86 --- /dev/null +++ b/acceptance/bundle/deploy/wal/future-serial-wal/databricks.yml @@ -0,0 +1,15 @@ +bundle: + name: wal-future-serial-test + +resources: + jobs: + test_job: + name: "test-job" + tasks: + - task_key: "test-task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/out.test.toml b/acceptance/bundle/deploy/wal/future-serial-wal/out.test.toml new file mode 100644 index 00000000000..54146af5645 --- /dev/null +++ b/acceptance/bundle/deploy/wal/future-serial-wal/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt new file mode 100644 index 00000000000..ffb03147dc7 --- /dev/null +++ b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt @@ -0,0 +1,29 @@ +=== Creating state file (serial=2) === +=== Creating WAL with future serial (serial=5, expected=3) === +=== WAL content === +{"lineage":"test-lineage-123","serial": [SERIAL]} +{"k":"resources.jobs.test_job","v":{"__id__": "[ID]","state":{"name":"test-job"}}} +=== Deploy (should fail with corruption error) 
=== + +>>> errcode [CLI] bundle deploy +Warning: Single node cluster is not correctly configured + at resources.jobs.test_job.tasks[0].new_cluster + in databricks.yml:13:13 + +num_workers should be 0 only for single-node clusters. To create a +valid single node cluster please ensure that the following properties +are correctly set in the cluster specification: + + spark_conf: + spark.databricks.cluster.profile: singleNode + spark.master: local[*] + + custom_tags: + ResourceClass: SingleNode + + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-future-serial-test/default/files... +Error: reading state from [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: WAL recovery failed: WAL serial (5) is ahead of expected (3), state may be corrupted + + +Exit code: [KILLED] diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/script b/acceptance/bundle/deploy/wal/future-serial-wal/script new file mode 100644 index 00000000000..7b1784b0c69 --- /dev/null +++ b/acceptance/bundle/deploy/wal/future-serial-wal/script @@ -0,0 +1,28 @@ +echo "=== Creating state file (serial=2) ===" +mkdir -p .databricks/bundle/default +cat > .databricks/bundle/default/resources.json << 'EOF' +{ + "state_version": 1, + "cli_version": "0.0.0", + "lineage": "test-lineage-123", + "serial": 2, + "state": { + "resources.jobs.test_job": { + "__id__": "1001", + "state": {"name": "test-job"} + } + } +} +EOF + +echo "=== Creating WAL with future serial (serial=5, expected=3) ===" +cat > .databricks/bundle/default/resources.json.wal << 'EOF' +{"lineage":"test-lineage-123","serial":5} +{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} +EOF + +echo "=== WAL content ===" +cat .databricks/bundle/default/resources.json.wal + +echo "=== Deploy (should fail with corruption error) ===" +trace errcode $CLI bundle deploy diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/test.py b/acceptance/bundle/deploy/wal/future-serial-wal/test.py new file mode 
100644 index 00000000000..1ff8e07c707 --- /dev/null +++ b/acceptance/bundle/deploy/wal/future-serial-wal/test.py @@ -0,0 +1 @@ +print("test") diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/test.toml b/acceptance/bundle/deploy/wal/future-serial-wal/test.toml new file mode 100644 index 00000000000..424fe2f1275 --- /dev/null +++ b/acceptance/bundle/deploy/wal/future-serial-wal/test.toml @@ -0,0 +1,4 @@ +# WAL with serial ahead of state - indicates corruption, should error. +# State has serial=2, WAL has serial=5 (expected would be 3). + +# No server stubs needed - deploy should fail before any API calls. diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/databricks.yml b/acceptance/bundle/deploy/wal/lineage-mismatch/databricks.yml new file mode 100644 index 00000000000..014ec7f8860 --- /dev/null +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/databricks.yml @@ -0,0 +1,15 @@ +bundle: + name: wal-lineage-mismatch-test + +resources: + jobs: + test_job: + name: "test-job" + tasks: + - task_key: "test-task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml b/acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml new file mode 100644 index 00000000000..54146af5645 --- /dev/null +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt new file mode 100644 index 00000000000..2419e7a6129 --- /dev/null +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt @@ -0,0 +1,29 @@ +=== Creating state file with lineage-A === +=== Creating WAL with lineage-B (mismatch) === +=== WAL content === +{"lineage":"wal-lineage-bbb","serial": [SERIAL]} 
+{"k":"resources.jobs.test_job","v":{"__id__": "[ID]","state":{"name":"test-job"}}} +=== Deploy (should fail with lineage mismatch error) === + +>>> errcode [CLI] bundle deploy +Warning: Single node cluster is not correctly configured + at resources.jobs.test_job.tasks[0].new_cluster + in databricks.yml:13:13 + +num_workers should be 0 only for single-node clusters. To create a +valid single node cluster please ensure that the following properties +are correctly set in the cluster specification: + + spark_conf: + spark.databricks.cluster.profile: singleNode + spark.master: local[*] + + custom_tags: + ResourceClass: SingleNode + + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-lineage-mismatch-test/default/files... +Error: reading state from [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: WAL recovery failed: WAL lineage (wal-lineage-bbb) does not match state lineage (state-lineage-aaa) + + +Exit code: [KILLED] diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/script b/acceptance/bundle/deploy/wal/lineage-mismatch/script new file mode 100644 index 00000000000..b241246e6c9 --- /dev/null +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/script @@ -0,0 +1,28 @@ +echo "=== Creating state file with lineage-A ===" +mkdir -p .databricks/bundle/default +cat > .databricks/bundle/default/resources.json << 'EOF' +{ + "state_version": 1, + "cli_version": "0.0.0", + "lineage": "state-lineage-aaa", + "serial": 1, + "state": { + "resources.jobs.test_job": { + "__id__": "1001", + "state": {"name": "test-job"} + } + } +} +EOF + +echo "=== Creating WAL with lineage-B (mismatch) ===" +cat > .databricks/bundle/default/resources.json.wal << 'EOF' +{"lineage":"wal-lineage-bbb","serial":2} +{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} +EOF + +echo "=== WAL content ===" +cat .databricks/bundle/default/resources.json.wal + +echo "=== Deploy (should fail with lineage mismatch error) ===" +trace errcode $CLI bundle 
deploy diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/test.py b/acceptance/bundle/deploy/wal/lineage-mismatch/test.py new file mode 100644 index 00000000000..1ff8e07c707 --- /dev/null +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/test.py @@ -0,0 +1 @@ +print("test") diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/test.toml b/acceptance/bundle/deploy/wal/lineage-mismatch/test.toml new file mode 100644 index 00000000000..509cc82f095 --- /dev/null +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/test.toml @@ -0,0 +1,4 @@ +# WAL with different lineage than state - should error. +# State has lineage "state-lineage-aaa", WAL has lineage "wal-lineage-bbb". + +# No server stubs needed - deploy should fail before any API calls. diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/databricks.yml b/acceptance/bundle/deploy/wal/multiple-crashes/databricks.yml new file mode 100644 index 00000000000..b4162d8fdf3 --- /dev/null +++ b/acceptance/bundle/deploy/wal/multiple-crashes/databricks.yml @@ -0,0 +1,15 @@ +bundle: + name: wal-multi-crash-test + +resources: + jobs: + test_job: + name: "test-job" + tasks: + - task_key: "test-task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml b/acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml new file mode 100644 index 00000000000..54146af5645 --- /dev/null +++ b/acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/output.txt b/acceptance/bundle/deploy/wal/multiple-crashes/output.txt new file mode 100644 index 00000000000..3e0426a628c --- /dev/null +++ b/acceptance/bundle/deploy/wal/multiple-crashes/output.txt @@ -0,0 +1,64 @@ +=== Creating state directory === +=== 
Creating WAL file (simulating crash after job create) === +=== WAL content === +{"lineage":"test-lineage-456","serial": [SERIAL]} +{"k":"resources.jobs.test_job","v":{"__id__": "[ID]","state":{"name":"test-job"}}} +=== First deploy attempt (will crash during update) === + +>>> errcode [CLI] bundle deploy +Warning: Single node cluster is not correctly configured + at resources.jobs.test_job.tasks[0].new_cluster + in databricks.yml:13:13 + +num_workers should be 0 only for single-node clusters. To create a +valid single node cluster please ensure that the following properties +are correctly set in the cluster specification: + + spark_conf: + spark.databricks.cluster.profile: singleNode + spark.master: local[*] + + custom_tags: + ResourceClass: SingleNode + + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files... +Deploying resources... +[PROCESS_KILLED] + +Exit code: [KILLED] +=== WAL after first crash === +{"lineage":"test-lineage-456","serial": [SERIAL]} +{"k":"resources.jobs.test_job","v":{"__id__": "[ID]","state":{"name":"test-job"}}} +=== Second deploy attempt (should succeed) === + +>>> [CLI] bundle deploy --force-lock +Warning: Single node cluster is not correctly configured + at resources.jobs.test_job.tasks[0].new_cluster + in databricks.yml:13:13 + +num_workers should be 0 only for single-node clusters. To create a +valid single node cluster please ensure that the following properties +are correctly set in the cluster specification: + + spark_conf: + spark.databricks.cluster.profile: singleNode + spark.master: local[*] + + custom_tags: + ResourceClass: SingleNode + + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
+=== Final state === +{ + "serial": [SERIAL], + "state_keys": [ + "resources.jobs.test_job" + ] +} +=== WAL after successful deploy === +WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/script b/acceptance/bundle/deploy/wal/multiple-crashes/script new file mode 100644 index 00000000000..795e4261e19 --- /dev/null +++ b/acceptance/bundle/deploy/wal/multiple-crashes/script @@ -0,0 +1,32 @@ +echo "=== Creating state directory ===" +mkdir -p .databricks/bundle/default + +echo "=== Creating WAL file (simulating crash after job create) ===" +cat > .databricks/bundle/default/resources.json.wal << 'EOF' +{"lineage":"test-lineage-456","serial":1} +{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} +EOF + +echo "=== WAL content ===" +cat .databricks/bundle/default/resources.json.wal + +echo "=== First deploy attempt (will crash during update) ===" +trace errcode $CLI bundle deploy + +echo "=== WAL after first crash ===" +if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + cat .databricks/bundle/default/resources.json.wal +fi + +echo "=== Second deploy attempt (should succeed) ===" +trace $CLI bundle deploy --force-lock + +echo "=== Final state ===" +cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys)}' + +echo "=== WAL after successful deploy ===" +if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + echo "WAL exists (unexpected)" +else + echo "WAL deleted (expected)" +fi diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/test.py b/acceptance/bundle/deploy/wal/multiple-crashes/test.py new file mode 100644 index 00000000000..1ff8e07c707 --- /dev/null +++ b/acceptance/bundle/deploy/wal/multiple-crashes/test.py @@ -0,0 +1 @@ +print("test") diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/test.toml b/acceptance/bundle/deploy/wal/multiple-crashes/test.toml new file mode 100644 index 00000000000..2e9973c8464 --- /dev/null +++ 
b/acceptance/bundle/deploy/wal/multiple-crashes/test.toml @@ -0,0 +1,10 @@ +# Multiple crashes during recovery - WAL should persist until successful finalize. + +[[Server]] +Pattern = "POST /api/2.2/jobs/reset" +KillCaller = 1 +Response.Body = '{}' + +[[Server]] +Pattern = "GET /api/2.2/jobs/get" +Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' diff --git a/acceptance/bundle/deploy/wal/normal-deploy/databricks.yml b/acceptance/bundle/deploy/wal/normal-deploy/databricks.yml new file mode 100644 index 00000000000..413705d40cb --- /dev/null +++ b/acceptance/bundle/deploy/wal/normal-deploy/databricks.yml @@ -0,0 +1,15 @@ +bundle: + name: wal-test + +resources: + jobs: + test_job: + name: "test-job" + tasks: + - task_key: "test-task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/normal-deploy/out.test.toml b/acceptance/bundle/deploy/wal/normal-deploy/out.test.toml new file mode 100644 index 00000000000..54146af5645 --- /dev/null +++ b/acceptance/bundle/deploy/wal/normal-deploy/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/normal-deploy/output.txt b/acceptance/bundle/deploy/wal/normal-deploy/output.txt new file mode 100644 index 00000000000..50c1430641f --- /dev/null +++ b/acceptance/bundle/deploy/wal/normal-deploy/output.txt @@ -0,0 +1,32 @@ + +>>> [CLI] bundle deploy +Warning: Single node cluster is not correctly configured + at resources.jobs.test_job.tasks[0].new_cluster + in databricks.yml:13:13 + +num_workers should be 0 only for single-node clusters. 
To create a +valid single node cluster please ensure that the following properties +are correctly set in the cluster specification: + + spark_conf: + spark.databricks.cluster.profile: singleNode + spark.master: local[*] + + custom_tags: + ResourceClass: SingleNode + + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! +=== Checking WAL file after deploy === +WAL file deleted after successful deploy (expected) +=== State file content === +{ + "lineage": "[UUID]", + "serial": [SERIAL], + "state_keys": [ + "resources.jobs.test_job" + ] +} diff --git a/acceptance/bundle/deploy/wal/normal-deploy/script b/acceptance/bundle/deploy/wal/normal-deploy/script new file mode 100644 index 00000000000..5acc4d9b589 --- /dev/null +++ b/acceptance/bundle/deploy/wal/normal-deploy/script @@ -0,0 +1,12 @@ +trace $CLI bundle deploy + +echo "=== Checking WAL file after deploy ===" +if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + echo "WAL file exists (unexpected - should be deleted after Finalize)" + cat .databricks/bundle/default/resources.json.wal +else + echo "WAL file deleted after successful deploy (expected)" +fi + +echo "=== State file content ===" +cat .databricks/bundle/default/resources.json | jq -S '{lineage: .lineage, serial: .serial, state_keys: (.state | keys)}' diff --git a/acceptance/bundle/deploy/wal/normal-deploy/test.py b/acceptance/bundle/deploy/wal/normal-deploy/test.py new file mode 100644 index 00000000000..1ff8e07c707 --- /dev/null +++ b/acceptance/bundle/deploy/wal/normal-deploy/test.py @@ -0,0 +1 @@ +print("test") diff --git a/acceptance/bundle/deploy/wal/normal-deploy/test.toml b/acceptance/bundle/deploy/wal/normal-deploy/test.toml new file mode 100644 index 00000000000..1299046974a --- /dev/null +++ b/acceptance/bundle/deploy/wal/normal-deploy/test.toml @@ -0,0 +1,9 @@ +# WAL is created during deploy, used for state tracking, and 
deleted after Finalize. + +[[Server]] +Pattern = "POST /api/2.2/jobs/create" +Response.Body = '{"job_id": 1001}' + +[[Server]] +Pattern = "GET /api/2.2/jobs/get" +Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' diff --git a/acceptance/bundle/deploy/wal/stale-wal/databricks.yml b/acceptance/bundle/deploy/wal/stale-wal/databricks.yml new file mode 100644 index 00000000000..6b24f6fd269 --- /dev/null +++ b/acceptance/bundle/deploy/wal/stale-wal/databricks.yml @@ -0,0 +1,15 @@ +bundle: + name: wal-stale-test + +resources: + jobs: + test_job: + name: "test-job" + tasks: + - task_key: "test-task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/stale-wal/out.test.toml b/acceptance/bundle/deploy/wal/stale-wal/out.test.toml new file mode 100644 index 00000000000..54146af5645 --- /dev/null +++ b/acceptance/bundle/deploy/wal/stale-wal/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/stale-wal/output.txt b/acceptance/bundle/deploy/wal/stale-wal/output.txt new file mode 100644 index 00000000000..3722788e52d --- /dev/null +++ b/acceptance/bundle/deploy/wal/stale-wal/output.txt @@ -0,0 +1,38 @@ +=== Creating state directory === +=== Creating state file (serial=2) === +=== Creating stale WAL with old serial (serial=1) === +=== WAL content before deploy === +{"lineage":"stale-test-lineage","serial": [SERIAL]} +{"k":"resources.jobs.stale_job","v":{"__id__": "[ID]","state":{"name":"stale-job"}}} +=== Deploy (should ignore stale WAL) === + +>>> [CLI] bundle deploy +Warning: Single node cluster is not correctly configured + at resources.jobs.test_job.tasks[0].new_cluster + in databricks.yml:13:13 + +num_workers should be 0 only for single-node clusters. 
To create a +valid single node cluster please ensure that the following properties +are correctly set in the cluster specification: + + spark_conf: + spark.databricks.cluster.profile: singleNode + spark.master: local[*] + + custom_tags: + ResourceClass: SingleNode + + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-stale-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! +=== Checking WAL file after deploy === +Stale WAL deleted (expected) +=== State file should NOT contain stale_job === +{ + "serial": [SERIAL], + "state_keys": [ + "resources.jobs.test_job" + ] +} diff --git a/acceptance/bundle/deploy/wal/stale-wal/script b/acceptance/bundle/deploy/wal/stale-wal/script new file mode 100644 index 00000000000..d814639a00e --- /dev/null +++ b/acceptance/bundle/deploy/wal/stale-wal/script @@ -0,0 +1,40 @@ +echo "=== Creating state directory ===" +mkdir -p .databricks/bundle/default + +echo "=== Creating state file (serial=2) ===" +cat > .databricks/bundle/default/resources.json << 'EOF' +{ + "state_version": 1, + "cli_version": "0.0.0", + "lineage": "stale-test-lineage", + "serial": 2, + "state": { + "resources.jobs.test_job": { + "__id__": "1001", + "state": {"name": "test-job"} + } + } +} +EOF + +echo "=== Creating stale WAL with old serial (serial=1) ===" +cat > .databricks/bundle/default/resources.json.wal << 'EOF' +{"lineage":"stale-test-lineage","serial":1} +{"k":"resources.jobs.stale_job","v":{"__id__":"9999","state":{"name":"stale-job"}}} +EOF + +echo "=== WAL content before deploy ===" +cat .databricks/bundle/default/resources.json.wal + +echo "=== Deploy (should ignore stale WAL) ===" +trace $CLI bundle deploy + +echo "=== Checking WAL file after deploy ===" +if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + echo "WAL file exists (unexpected)" +else + echo "Stale WAL deleted (expected)" +fi + +echo "=== State file should NOT contain stale_job ===" +cat 
.databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys)}' diff --git a/acceptance/bundle/deploy/wal/stale-wal/test.py b/acceptance/bundle/deploy/wal/stale-wal/test.py new file mode 100644 index 00000000000..1ff8e07c707 --- /dev/null +++ b/acceptance/bundle/deploy/wal/stale-wal/test.py @@ -0,0 +1 @@ +print("test") diff --git a/acceptance/bundle/deploy/wal/stale-wal/test.toml b/acceptance/bundle/deploy/wal/stale-wal/test.toml new file mode 100644 index 00000000000..934683ba6d8 --- /dev/null +++ b/acceptance/bundle/deploy/wal/stale-wal/test.toml @@ -0,0 +1,9 @@ +# Deploy with a stale WAL (old serial) - WAL should be deleted and ignored. + +[[Server]] +Pattern = "POST /api/2.2/jobs/reset" +Response.Body = '{}' + +[[Server]] +Pattern = "GET /api/2.2/jobs/get" +Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/databricks.yml b/acceptance/bundle/deploy/wal/summary-after-crash/databricks.yml new file mode 100644 index 00000000000..063faa8e546 --- /dev/null +++ b/acceptance/bundle/deploy/wal/summary-after-crash/databricks.yml @@ -0,0 +1,15 @@ +bundle: + name: wal-summary-test + +resources: + jobs: + test_job: + name: "test-job" + tasks: + - task_key: "test-task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/out.test.toml b/acceptance/bundle/deploy/wal/summary-after-crash/out.test.toml new file mode 100644 index 00000000000..54146af5645 --- /dev/null +++ b/acceptance/bundle/deploy/wal/summary-after-crash/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/output.txt b/acceptance/bundle/deploy/wal/summary-after-crash/output.txt new file mode 100644 index 00000000000..2e6abf645ae --- 
/dev/null +++ b/acceptance/bundle/deploy/wal/summary-after-crash/output.txt @@ -0,0 +1,25 @@ +=== Creating state directory === +=== Creating WAL file (simulating crash after job create) === +=== Bundle summary (should show job from WAL with id) === + +>>> [CLI] bundle summary -o json +Warning: Single node cluster is not correctly configured + at resources.jobs.test_job.tasks[0].new_cluster + in databricks.yml:13:13 + +num_workers should be 0 only for single-node clusters. To create a +valid single node cluster please ensure that the following properties +are correctly set in the cluster specification: + + spark_conf: + spark.databricks.cluster.profile: singleNode + spark.master: local[*] + + custom_tags: + ResourceClass: SingleNode + + +{ + "job_id": "[ID]", + "modified_status": null +} diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/script b/acceptance/bundle/deploy/wal/summary-after-crash/script new file mode 100644 index 00000000000..d2017c65907 --- /dev/null +++ b/acceptance/bundle/deploy/wal/summary-after-crash/script @@ -0,0 +1,11 @@ +echo "=== Creating state directory ===" +mkdir -p .databricks/bundle/default + +echo "=== Creating WAL file (simulating crash after job create) ===" +cat > .databricks/bundle/default/resources.json.wal << 'EOF' +{"lineage":"summary-test-lineage","serial":1} +{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} +EOF + +echo "=== Bundle summary (should show job from WAL with id) ===" +trace $CLI bundle summary -o json | jq '{job_id: .resources.jobs.test_job.id, modified_status: .resources.jobs.test_job.modified_status}' diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/test.py b/acceptance/bundle/deploy/wal/summary-after-crash/test.py new file mode 100644 index 00000000000..1ff8e07c707 --- /dev/null +++ b/acceptance/bundle/deploy/wal/summary-after-crash/test.py @@ -0,0 +1 @@ +print("test") diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/test.toml 
b/acceptance/bundle/deploy/wal/summary-after-crash/test.toml new file mode 100644 index 00000000000..3363a1c516e --- /dev/null +++ b/acceptance/bundle/deploy/wal/summary-after-crash/test.toml @@ -0,0 +1,2 @@ +# Bundle summary should show resources recovered from WAL. +# No server stubs needed - we just run bundle summary which reads state. diff --git a/acceptance/bundle/deploy/wal/test.toml b/acceptance/bundle/deploy/wal/test.toml new file mode 100644 index 00000000000..7fd1daf93bd --- /dev/null +++ b/acceptance/bundle/deploy/wal/test.toml @@ -0,0 +1,43 @@ +# WAL (Write-Ahead Log) tests verify crash recovery during bundle deployment. +# These tests simulate process crashes using KillCaller and verify state recovery. +# Only runs with direct engine since WAL is a direct-engine feature. + +Local = true +Env.DATABRICKS_CLI_TEST_PID = "1" + +[EnvMatrix] +DATABRICKS_BUNDLE_ENGINE = ["direct"] + +[[Repls]] +Old = 'script: line \d+:\s+\d+ Killed(: 9)?\s+"\$@"' +New = '[PROCESS_KILLED]' + +[[Repls]] +Old = '(\n>>> errcode [^\n]+\n)\nExit code:' +New = """${1}[PROCESS_KILLED] + +Exit code:""" + +[[Repls]] +Old = 'Exit code: (137|1)' +New = 'Exit code: [KILLED]' + +[[Repls]] +Old = "\r" +New = '' + +[[Repls]] +Old = '"lineage":\s*"[0-9a-f-]+"' +New = '"lineage": "[UUID]"' + +[[Repls]] +Old = '"serial":\s*\d+' +New = '"serial": [SERIAL]' + +[[Repls]] +Old = '"__id__":\s*"\d+"' +New = '"__id__": "[ID]"' + +[[Repls]] +Old = '"job_id":\s*"\d+"' +New = '"job_id": "[ID]"' diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/databricks.yml b/acceptance/bundle/deploy/wal/wal-with-delete/databricks.yml new file mode 100644 index 00000000000..457a2d3e964 --- /dev/null +++ b/acceptance/bundle/deploy/wal/wal-with-delete/databricks.yml @@ -0,0 +1,15 @@ +bundle: + name: wal-delete-test + +resources: + jobs: + test_job: + name: "test-job" + tasks: + - task_key: "test-task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: 
i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/out.test.toml b/acceptance/bundle/deploy/wal/wal-with-delete/out.test.toml new file mode 100644 index 00000000000..54146af5645 --- /dev/null +++ b/acceptance/bundle/deploy/wal/wal-with-delete/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/output.txt b/acceptance/bundle/deploy/wal/wal-with-delete/output.txt new file mode 100644 index 00000000000..8f52732d3e9 --- /dev/null +++ b/acceptance/bundle/deploy/wal/wal-with-delete/output.txt @@ -0,0 +1,21 @@ +=== Creating state directory === +=== Creating state file (job exists) === +=== Creating WAL with delete entry (simulating crash during delete) === +=== WAL content === +{"lineage":"delete-test-lineage","serial": [SERIAL]} +{"k":"resources.jobs.test_job","v":null} +=== Updating config to remove job === +=== Deploy (should recover delete from WAL) === + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-delete-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
+=== Final state (should have no jobs) === +{ + "serial": [SERIAL], + "state_keys": [] +} +=== WAL after successful deploy === +WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/script b/acceptance/bundle/deploy/wal/wal-with-delete/script new file mode 100644 index 00000000000..f840355267c --- /dev/null +++ b/acceptance/bundle/deploy/wal/wal-with-delete/script @@ -0,0 +1,48 @@ +echo "=== Creating state directory ===" +mkdir -p .databricks/bundle/default + +echo "=== Creating state file (job exists) ===" +cat > .databricks/bundle/default/resources.json << 'EOF' +{ + "state_version": 1, + "cli_version": "0.0.0", + "lineage": "delete-test-lineage", + "serial": 1, + "state": { + "resources.jobs.test_job": { + "__id__": "1001", + "state": {"name": "test-job"} + } + } +} +EOF + +echo "=== Creating WAL with delete entry (simulating crash during delete) ===" +cat > .databricks/bundle/default/resources.json.wal << 'EOF' +{"lineage":"delete-test-lineage","serial":2} +{"k":"resources.jobs.test_job","v":null} +EOF + +echo "=== WAL content ===" +cat .databricks/bundle/default/resources.json.wal + +echo "=== Updating config to remove job ===" +cat > databricks.yml << 'EOF' +bundle: + name: wal-delete-test + +resources: {} +EOF + +echo "=== Deploy (should recover delete from WAL) ===" +trace $CLI bundle deploy + +echo "=== Final state (should have no jobs) ===" +cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys)}' + +echo "=== WAL after successful deploy ===" +if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + echo "WAL exists (unexpected)" +else + echo "WAL deleted (expected)" +fi diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/test.py b/acceptance/bundle/deploy/wal/wal-with-delete/test.py new file mode 100644 index 00000000000..1ff8e07c707 --- /dev/null +++ b/acceptance/bundle/deploy/wal/wal-with-delete/test.py @@ -0,0 +1 @@ +print("test") diff --git 
a/acceptance/bundle/deploy/wal/wal-with-delete/test.toml b/acceptance/bundle/deploy/wal/wal-with-delete/test.toml new file mode 100644 index 00000000000..27045f8885e --- /dev/null +++ b/acceptance/bundle/deploy/wal/wal-with-delete/test.toml @@ -0,0 +1,5 @@ +# WAL recovery after crash during delete operation. +# Delete was recorded in WAL but not finalized. Deploy should complete the delete. + +# No server stubs needed - the delete was already done (recorded in WAL) +# and the job no longer needs API calls diff --git a/bundle/direct/bind.go b/bundle/direct/bind.go index ed5cbbc07bc..08d849d14c1 100644 --- a/bundle/direct/bind.go +++ b/bundle/direct/bind.go @@ -62,7 +62,7 @@ type BindResult struct { func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.WorkspaceClient, configRoot *config.Root, statePath, resourceKey, resourceID string) (*BindResult, error) { // Check if the resource is already managed (bound to a different ID) var checkStateDB dstate.DeploymentState - if err := checkStateDB.Open(statePath); err == nil { + if err := checkStateDB.Open(ctx, statePath); err == nil { if existingID := checkStateDB.GetResourceID(resourceKey); existingID != "" { return nil, ErrResourceAlreadyBound{ ResourceKey: resourceKey, @@ -82,7 +82,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac } // Open temp state - err := b.StateDB.Open(tmpStatePath) + err := b.StateDB.Open(ctx, tmpStatePath) if err != nil { os.Remove(tmpStatePath) return nil, err @@ -188,7 +188,7 @@ func (result *BindResult) Cancel() { // Unbind removes a resource from direct engine state without deleting // the workspace resource. Also removes associated permissions/grants entries. 
func (b *DeploymentBundle) Unbind(ctx context.Context, statePath, resourceKey string) error { - err := b.StateDB.Open(statePath) + err := b.StateDB.Open(ctx, statePath) if err != nil { return err } diff --git a/bundle/direct/bundle_apply.go b/bundle/direct/bundle_apply.go index a7f3ee65fc2..aec6e7cc523 100644 --- a/bundle/direct/bundle_apply.go +++ b/bundle/direct/bundle_apply.go @@ -21,7 +21,12 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa } if len(plan.Plan) == 0 { - // Avoid creating state file if nothing to deploy + // Still need to finalize if WAL recovery happened to commit the recovered state + if b.StateDB.RecoveredFromWAL() { + if err := b.StateDB.Finalize(); err != nil { + logdiag.LogError(ctx, err) + } + } return } diff --git a/bundle/direct/bundle_plan.go b/bundle/direct/bundle_plan.go index f6bcea316cd..1fb70123b96 100644 --- a/bundle/direct/bundle_plan.go +++ b/bundle/direct/bundle_plan.go @@ -40,7 +40,7 @@ func (b *DeploymentBundle) init(client *databricks.WorkspaceClient) error { // ValidatePlanAgainstState validates that a plan's lineage and serial match the current state. // This should be called early in the deployment process, before any file operations. // If the plan has no lineage (first deployment), validation is skipped. 
-func ValidatePlanAgainstState(stateDB *dstate.DeploymentState, plan *deployplan.Plan) error { +func ValidatePlanAgainstState(ctx context.Context, stateDB *dstate.DeploymentState, plan *deployplan.Plan) error { // If plan has no lineage, this is a first deployment before any state exists // No validation needed if plan.Lineage == "" { diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 3f6bcce2fc5..9113021c8f1 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -14,15 +14,18 @@ import ( "github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/statemgmt/resourcestate" "github.com/databricks/cli/internal/build" + "github.com/databricks/cli/libs/log" "github.com/google/uuid" ) const currentStateVersion = 2 type DeploymentState struct { - Path string - Data Database - mu sync.Mutex + Path string + Data Database + mu sync.Mutex + wal *WAL + recoveredFromWAL bool } type Database struct { @@ -63,12 +66,22 @@ func (db *DeploymentState) SaveState(key, newID string, state any, dependsOn []d return err } - db.Data.State[key] = ResourceEntry{ + entry := ResourceEntry{ ID: newID, State: json.RawMessage(jsonMessage), DependsOn: dependsOn, } + // Write to WAL before updating memory + if err := db.ensureWALOpen(); err != nil { + return fmt.Errorf("failed to open WAL: %w", err) + } + if err := db.wal.writeEntry(key, &entry); err != nil { + return fmt.Errorf("failed to write WAL entry: %w", err) + } + + db.Data.State[key] = entry + return nil } @@ -81,11 +94,50 @@ func (db *DeploymentState) DeleteState(key string) error { return nil } + // Write to WAL before updating memory (nil entry means delete) + if err := db.ensureWALOpen(); err != nil { + return fmt.Errorf("failed to open WAL: %w", err) + } + if err := db.wal.writeEntry(key, nil); err != nil { + return fmt.Errorf("failed to write WAL entry: %w", err) + } + delete(db.Data.State, key) return nil } +// ensureWALOpen opens the WAL file and writes 
the header if not already done. +// Must be called while holding db.mu. +func (db *DeploymentState) ensureWALOpen() error { + if db.wal != nil { + return nil + } + + wal, err := openWAL(db.Path) + if err != nil { + return err + } + + // Generate lineage if this is a fresh deployment + lineage := db.Data.Lineage + if lineage == "" { + lineage = uuid.New().String() + db.Data.Lineage = lineage + } + + // WAL serial is the NEXT serial (current + 1) + walSerial := db.Data.Serial + 1 + + if err := wal.writeHeader(lineage, walSerial); err != nil { + wal.close() + return err + } + + db.wal = wal + return nil +} + func (db *DeploymentState) getResourceEntry(key string) (ResourceEntry, bool) { db.AssertOpened() db.mu.Lock() @@ -110,7 +162,7 @@ func (db *DeploymentState) GetResourceID(key string) string { return entry.ID } -func (db *DeploymentState) Open(path string) error { +func (db *DeploymentState) Open(ctx context.Context, path string) error { db.mu.Lock() defer db.mu.Unlock() @@ -124,21 +176,39 @@ func (db *DeploymentState) Open(path string) error { // Create new database with serial=0, will be incremented to 1 in Finalize() db.Data = NewDatabase("", 0) db.Path = path - return nil + + // Write state file immediately to ensure it exists before any WAL operations. + // This guarantees we have a base state file for recovery validation. 
+ if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + return fmt.Errorf("failed to create state directory: %w", err) + } + if err := db.unlockedSave(); err != nil { + return err + } + } else { + return err } - return err + } else { + err = json.Unmarshal(data, &db.Data) + if err != nil { + return err + } + db.Path = path } - err = json.Unmarshal(data, &db.Data) + // Attempt WAL recovery + recovered, err := recoverFromWAL(path, &db.Data) if err != nil { - return err + return fmt.Errorf("WAL recovery failed: %w", err) + } + if recovered { + log.Infof(ctx, "Recovered deployment state from WAL") + db.recoveredFromWAL = true } if err := migrateState(&db.Data); err != nil { return fmt.Errorf("migrating state %s: %w", path, err) } - - db.Path = path return nil } @@ -146,14 +216,33 @@ func (db *DeploymentState) Finalize() error { db.mu.Lock() defer db.mu.Unlock() - // Generate lineage on first save + // Generate lineage on first save (if WAL wasn't opened) if db.Data.Lineage == "" { db.Data.Lineage = uuid.New().String() } db.Data.Serial++ - return db.unlockedSave() + err := db.unlockedSave() + if err != nil { + return err + } + + // Truncate WAL after successful state file write + if db.wal != nil { + if err := db.wal.truncate(); err != nil { + return fmt.Errorf("failed to truncate WAL: %w", err) + } + db.wal = nil + } else { + // No WAL was opened, but we should still clean up any stale WAL file + wp := walPath(db.Path) + if err := os.Remove(wp); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("failed to remove stale WAL file: %w", err) + } + } + + return nil } func (db *DeploymentState) AssertOpened() { @@ -162,6 +251,12 @@ func (db *DeploymentState) AssertOpened() { } } +// RecoveredFromWAL returns true if state was recovered from WAL during Open(). +// This is used to determine if Finalize() should be called even with an empty plan. 
+func (db *DeploymentState) RecoveredFromWAL() bool { + return db.recoveredFromWAL +} + func (db *DeploymentState) ExportState(ctx context.Context) resourcestate.ExportedResourcesMap { result := make(resourcestate.ExportedResourcesMap) for key, entry := range db.Data.State { diff --git a/bundle/direct/dstate/wal.go b/bundle/direct/dstate/wal.go new file mode 100644 index 00000000000..700bfa24e2d --- /dev/null +++ b/bundle/direct/dstate/wal.go @@ -0,0 +1,218 @@ +package dstate + +import ( + "bufio" + "encoding/json" + "errors" + "fmt" + "os" +) + +// WALHeader is the first entry in the WAL file, containing metadata for validation. +type WALHeader struct { + Lineage string `json:"lineage"` + Serial int `json:"serial"` +} + +// WALEntry represents a single state mutation in the WAL. +// For set operations, V is populated. For delete operations, V is nil. +type WALEntry struct { + K string `json:"k"` + V *ResourceEntry `json:"v,omitempty"` +} + +// WAL manages the Write-Ahead Log for deployment state recovery. +type WAL struct { + path string + file *os.File +} + +// walPath returns the WAL file path for a given state file path. +func walPath(statePath string) string { + return statePath + ".wal" +} + +// openWAL opens or creates a WAL file for writing. +func openWAL(statePath string) (*WAL, error) { + wp := walPath(statePath) + f, err := os.OpenFile(wp, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o600) + if err != nil { + return nil, fmt.Errorf("failed to open WAL file %q: %w", wp, err) + } + return &WAL{path: wp, file: f}, nil +} + +// writeHeader writes the WAL header (lineage and serial) as the first entry. +func (w *WAL) writeHeader(lineage string, serial int) error { + header := WALHeader{ + Lineage: lineage, + Serial: serial, + } + return w.writeJSON(header) +} + +// writeEntry appends a state mutation entry to the WAL. 
+func (w *WAL) writeEntry(key string, entry *ResourceEntry) error { + walEntry := WALEntry{ + K: key, + V: entry, + } + return w.writeJSON(walEntry) +} + +// writeJSON marshals and writes a JSON object as a single line, then syncs to disk. +func (w *WAL) writeJSON(v any) error { + data, err := json.Marshal(v) + if err != nil { + return fmt.Errorf("failed to marshal WAL entry: %w", err) + } + data = append(data, '\n') + + _, err = w.file.Write(data) + if err != nil { + return fmt.Errorf("failed to write WAL entry: %w", err) + } + + err = w.file.Sync() + if err != nil { + return fmt.Errorf("failed to sync WAL file: %w", err) + } + + return nil +} + +// close closes the WAL file handle. +func (w *WAL) close() error { + if w.file != nil { + return w.file.Close() + } + return nil +} + +// truncate deletes the WAL file after successful finalization. +func (w *WAL) truncate() error { + if w.file != nil { + w.file.Close() + w.file = nil + } + err := os.Remove(w.path) + if err != nil && !os.IsNotExist(err) { + return fmt.Errorf("failed to remove WAL file %q: %w", w.path, err) + } + return nil +} + +// readWAL reads and parses an existing WAL file for recovery. +// Returns the header and entries, or an error if the WAL is invalid. 
+func readWAL(statePath string) (*WALHeader, []WALEntry, error) { + wp := walPath(statePath) + f, err := os.Open(wp) + if err != nil { + return nil, nil, err + } + defer f.Close() + + scanner := bufio.NewScanner(f) + var header *WALHeader + var entries []WALEntry + lineNum := 0 + + for scanner.Scan() { + lineNum++ + line := scanner.Bytes() + if len(line) == 0 { + continue + } + + if header == nil { + // First line must be the header + var h WALHeader + if err := json.Unmarshal(line, &h); err != nil { + return nil, nil, fmt.Errorf("WAL line %d: failed to parse header: %w", lineNum, err) + } + header = &h + } else { + // Subsequent lines are entries + var e WALEntry + if err := json.Unmarshal(line, &e); err != nil { + // Skip corrupted lines silently - this is expected for partial writes + continue + } + if e.K == "" { + // Skip entries with empty keys + continue + } + entries = append(entries, e) + } + } + + if err := scanner.Err(); err != nil { + return nil, nil, fmt.Errorf("failed to read WAL file: %w", err) + } + + if header == nil { + return nil, nil, errors.New("WAL file is empty or missing header") + } + + return header, entries, nil +} + +// recoverFromWAL attempts to recover state from an existing WAL file. +// It validates the WAL against the current state and replays valid entries. +// Returns true if recovery was performed, false if no recovery needed. 
+func recoverFromWAL(statePath string, db *Database) (bool, error) { + wp := walPath(statePath) + + // Check if WAL exists + if _, err := os.Stat(wp); os.IsNotExist(err) { + return false, nil + } + + header, entries, err := readWAL(statePath) + if err != nil { + // If we can't read the WAL at all, delete it and proceed + os.Remove(wp) + return false, nil + } + + // Validate WAL serial against state serial + expectedSerial := db.Serial + 1 + if header.Serial < expectedSerial { + // Stale WAL - delete and proceed without recovery + os.Remove(wp) + return false, nil + } + + if header.Serial > expectedSerial { + // WAL is ahead of state - this indicates corruption + return false, fmt.Errorf("WAL serial (%d) is ahead of expected (%d), state may be corrupted", header.Serial, expectedSerial) + } + + // Validate lineage if both exist + if db.Lineage != "" && header.Lineage != "" && db.Lineage != header.Lineage { + return false, fmt.Errorf("WAL lineage (%s) does not match state lineage (%s)", header.Lineage, db.Lineage) + } + + // Adopt lineage from WAL if state doesn't have one + if db.Lineage == "" && header.Lineage != "" { + db.Lineage = header.Lineage + } + + // Initialize state map if needed + if db.State == nil { + db.State = make(map[string]ResourceEntry) + } + + // Replay entries + for _, entry := range entries { + if entry.V != nil { + // Set operation + db.State[entry.K] = *entry.V + } else { + // Delete operation + delete(db.State, entry.K) + } + } + + return true, nil +} diff --git a/bundle/direct/dstate/wal_test.go b/bundle/direct/dstate/wal_test.go new file mode 100644 index 00000000000..e475a92e9dc --- /dev/null +++ b/bundle/direct/dstate/wal_test.go @@ -0,0 +1,419 @@ +package dstate + +import ( + "context" + "encoding/json" + "os" + "path/filepath" + "testing" + + "github.com/databricks/cli/bundle/deployplan" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestWALPath(t *testing.T) { + assert.Equal(t, 
"/path/to/state.json.wal", walPath("/path/to/state.json")) +} + +func TestWALWriteAndRead(t *testing.T) { + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + + // Open WAL for writing + wal, err := openWAL(statePath) + require.NoError(t, err) + + // Write header + err = wal.writeHeader("test-lineage", 1) + require.NoError(t, err) + + // Write entries + entry1 := &ResourceEntry{ + ID: "12345", + State: json.RawMessage(`{"name":"job1"}`), + } + err = wal.writeEntry("resources.jobs.job1", entry1) + require.NoError(t, err) + + entry2 := &ResourceEntry{ + ID: "67890", + State: json.RawMessage(`{"name":"job2"}`), + } + err = wal.writeEntry("resources.jobs.job2", entry2) + require.NoError(t, err) + + // Write a delete entry (nil value) + err = wal.writeEntry("resources.jobs.old_job", nil) + require.NoError(t, err) + + err = wal.close() + require.NoError(t, err) + + // Read WAL back + header, entries, err := readWAL(statePath) + require.NoError(t, err) + + assert.Equal(t, "test-lineage", header.Lineage) + assert.Equal(t, 1, header.Serial) + + require.Len(t, entries, 3) + + assert.Equal(t, "resources.jobs.job1", entries[0].K) + require.NotNil(t, entries[0].V) + assert.Equal(t, "12345", entries[0].V.ID) + + assert.Equal(t, "resources.jobs.job2", entries[1].K) + require.NotNil(t, entries[1].V) + assert.Equal(t, "67890", entries[1].V.ID) + + assert.Equal(t, "resources.jobs.old_job", entries[2].K) + assert.Nil(t, entries[2].V) +} + +func TestWALTruncate(t *testing.T) { + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + walFilePath := walPath(statePath) + + // Create WAL file + wal, err := openWAL(statePath) + require.NoError(t, err) + err = wal.writeHeader("test-lineage", 1) + require.NoError(t, err) + + // Verify file exists + _, err = os.Stat(walFilePath) + require.NoError(t, err) + + // Truncate + err = wal.truncate() + require.NoError(t, err) + + // Verify file is removed + _, err = os.Stat(walFilePath) + assert.True(t, 
os.IsNotExist(err)) +} + +func TestRecoverFromWAL_NoWAL(t *testing.T) { + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + + db := NewDatabase("", 0) + recovered, err := recoverFromWAL(statePath, &db) + require.NoError(t, err) + assert.False(t, recovered) +} + +func TestRecoverFromWAL_ValidWAL(t *testing.T) { + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + + // Create WAL with serial = 1 (expecting state serial 0 + 1) + wal, err := openWAL(statePath) + require.NoError(t, err) + err = wal.writeHeader("test-lineage", 1) + require.NoError(t, err) + + entry := &ResourceEntry{ + ID: "12345", + State: json.RawMessage(`{"name":"job1"}`), + } + err = wal.writeEntry("resources.jobs.job1", entry) + require.NoError(t, err) + err = wal.close() + require.NoError(t, err) + + // Create database with serial 0 + db := NewDatabase("", 0) + + // Recover + recovered, err := recoverFromWAL(statePath, &db) + require.NoError(t, err) + assert.True(t, recovered) + + // Verify state was recovered + assert.Equal(t, "test-lineage", db.Lineage) + require.Contains(t, db.State, "resources.jobs.job1") + assert.Equal(t, "12345", db.State["resources.jobs.job1"].ID) +} + +func TestRecoverFromWAL_StaleWAL(t *testing.T) { + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + walFilePath := walPath(statePath) + + // Create WAL with serial = 1 + wal, err := openWAL(statePath) + require.NoError(t, err) + err = wal.writeHeader("test-lineage", 1) + require.NoError(t, err) + err = wal.close() + require.NoError(t, err) + + // Create database with serial 2 (WAL is stale) + db := NewDatabase("test-lineage", 2) + + // Recover - should skip and delete WAL + recovered, err := recoverFromWAL(statePath, &db) + require.NoError(t, err) + assert.False(t, recovered) + + // WAL should be deleted + _, err = os.Stat(walFilePath) + assert.True(t, os.IsNotExist(err)) +} + +func TestRecoverFromWAL_FutureWAL(t *testing.T) { + dir := t.TempDir() + 
statePath := filepath.Join(dir, "resources.json") + + // Create WAL with serial = 5 + wal, err := openWAL(statePath) + require.NoError(t, err) + err = wal.writeHeader("test-lineage", 5) + require.NoError(t, err) + err = wal.close() + require.NoError(t, err) + + // Create database with serial 0 (WAL is from future - corrupted state) + db := NewDatabase("test-lineage", 0) + + // Recover - should fail + _, err = recoverFromWAL(statePath, &db) + assert.Error(t, err) + assert.Contains(t, err.Error(), "WAL serial (5) is ahead of expected (1)") +} + +func TestRecoverFromWAL_LineageMismatch(t *testing.T) { + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + + // Create WAL with lineage A + wal, err := openWAL(statePath) + require.NoError(t, err) + err = wal.writeHeader("lineage-A", 1) + require.NoError(t, err) + err = wal.close() + require.NoError(t, err) + + // Create database with lineage B + db := NewDatabase("lineage-B", 0) + + // Recover - should fail + _, err = recoverFromWAL(statePath, &db) + assert.Error(t, err) + assert.Contains(t, err.Error(), "lineage") +} + +func TestRecoverFromWAL_DeleteOperation(t *testing.T) { + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + + // Create WAL with delete operation + wal, err := openWAL(statePath) + require.NoError(t, err) + err = wal.writeHeader("test-lineage", 1) + require.NoError(t, err) + + // Add an entry + entry := &ResourceEntry{ + ID: "12345", + State: json.RawMessage(`{"name":"job1"}`), + } + err = wal.writeEntry("resources.jobs.job1", entry) + require.NoError(t, err) + + // Delete the entry + err = wal.writeEntry("resources.jobs.job1", nil) + require.NoError(t, err) + + err = wal.close() + require.NoError(t, err) + + // Create database + db := NewDatabase("", 0) + + // Recover + recovered, err := recoverFromWAL(statePath, &db) + require.NoError(t, err) + assert.True(t, recovered) + + // Entry should NOT be present (deleted) + assert.NotContains(t, db.State, 
"resources.jobs.job1") +} + +func TestDeploymentState_WALIntegration(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + walFilePath := walPath(statePath) + + // Create deployment state + var db DeploymentState + err := db.Open(ctx, statePath) + require.NoError(t, err) + + // Save some state + err = db.SaveState("resources.jobs.job1", "12345", map[string]string{"name": "job1"}, nil) + require.NoError(t, err) + + // WAL should exist + _, err = os.Stat(walFilePath) + require.NoError(t, err) + + // Read WAL to verify content + header, entries, err := readWAL(statePath) + require.NoError(t, err) + assert.Equal(t, 1, header.Serial) // serial + 1 + require.Len(t, entries, 1) + assert.Equal(t, "resources.jobs.job1", entries[0].K) + assert.Equal(t, "12345", entries[0].V.ID) + + // Finalize + err = db.Finalize() + require.NoError(t, err) + + // WAL should be deleted + _, err = os.Stat(walFilePath) + assert.True(t, os.IsNotExist(err)) + + // State file should exist with correct serial + data, err := os.ReadFile(statePath) + require.NoError(t, err) + var savedDB Database + err = json.Unmarshal(data, &savedDB) + require.NoError(t, err) + assert.Equal(t, 1, savedDB.Serial) + assert.Contains(t, savedDB.State, "resources.jobs.job1") +} + +func TestDeploymentState_WALRecoveryOnOpen(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + + // Create initial state file + initialDB := NewDatabase("test-lineage", 5) + initialDB.State["resources.jobs.existing"] = ResourceEntry{ + ID: "existing-id", + State: json.RawMessage(`{"name":"existing"}`), + } + data, err := json.Marshal(initialDB) + require.NoError(t, err) + err = os.WriteFile(statePath, data, 0o600) + require.NoError(t, err) + + // Create WAL with serial 6 (5 + 1) + wal, err := openWAL(statePath) + require.NoError(t, err) + err = wal.writeHeader("test-lineage", 6) + require.NoError(t, err) + 
entry := &ResourceEntry{ + ID: "new-id", + State: json.RawMessage(`{"name":"new"}`), + } + err = wal.writeEntry("resources.jobs.new", entry) + require.NoError(t, err) + err = wal.close() + require.NoError(t, err) + + // Open should recover from WAL + var db DeploymentState + err = db.Open(ctx, statePath) + require.NoError(t, err) + + // Both existing and new resources should be present + assert.Contains(t, db.Data.State, "resources.jobs.existing") + assert.Contains(t, db.Data.State, "resources.jobs.new") + assert.Equal(t, "new-id", db.Data.State["resources.jobs.new"].ID) +} + +func TestDeploymentState_DeleteStateWritesWAL(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + + var db DeploymentState + err := db.Open(ctx, statePath) + require.NoError(t, err) + + // Add a resource + err = db.SaveState("resources.jobs.job1", "12345", map[string]string{"name": "job1"}, nil) + require.NoError(t, err) + + // Delete the resource + err = db.DeleteState("resources.jobs.job1") + require.NoError(t, err) + + // Read WAL to verify delete entry + _, entries, err := readWAL(statePath) + require.NoError(t, err) + + require.Len(t, entries, 2) + assert.Equal(t, "resources.jobs.job1", entries[1].K) + assert.Nil(t, entries[1].V) // nil means delete + + // Finalize + err = db.Finalize() + require.NoError(t, err) + + // State file should NOT contain the deleted resource + data, err := os.ReadFile(statePath) + require.NoError(t, err) + var savedDB Database + err = json.Unmarshal(data, &savedDB) + require.NoError(t, err) + assert.NotContains(t, savedDB.State, "resources.jobs.job1") +} + +func TestDeploymentState_WALWithDependsOn(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + + var db DeploymentState + err := db.Open(ctx, statePath) + require.NoError(t, err) + + dependsOn := []deployplan.DependsOnEntry{ + {Node: "resources.clusters.cluster1", Label: 
"${resources.clusters.cluster1.id}"}, + } + + err = db.SaveState("resources.jobs.job1", "12345", map[string]string{"name": "job1"}, dependsOn) + require.NoError(t, err) + + // Read WAL + _, entries, err := readWAL(statePath) + require.NoError(t, err) + + require.Len(t, entries, 1) + require.NotNil(t, entries[0].V) + require.Len(t, entries[0].V.DependsOn, 1) + assert.Equal(t, "resources.clusters.cluster1", entries[0].V.DependsOn[0].Node) +} + +func TestRecoverFromWAL_CorruptedLine(t *testing.T) { + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + walFilePath := walPath(statePath) + + // Manually write WAL with corrupted line + content := `{"lineage":"test","serial":1} +{"k":"resources.jobs.job1","v":{"__id__":"12345","state":{}}} +not valid json +{"k":"resources.jobs.job2","v":{"__id__":"67890","state":{}}} +` + err := os.WriteFile(walFilePath, []byte(content), 0o600) + require.NoError(t, err) + + db := NewDatabase("", 0) + recovered, err := recoverFromWAL(statePath, &db) + require.NoError(t, err) + assert.True(t, recovered) + + // Should have recovered job1 and job2, skipping corrupted line + assert.Contains(t, db.State, "resources.jobs.job1") + assert.Contains(t, db.State, "resources.jobs.job2") +} + diff --git a/cmd/bundle/utils/process.go b/cmd/bundle/utils/process.go index a6f48d99fa2..75081de56e6 100644 --- a/cmd/bundle/utils/process.go +++ b/cmd/bundle/utils/process.go @@ -236,7 +236,7 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle // Validate that the plan's lineage and serial match the current state // This must happen before any file operations - err = direct.ValidatePlanAgainstState(&b.DeploymentBundle.StateDB, plan) + err = direct.ValidatePlanAgainstState(ctx, &b.DeploymentBundle.StateDB, plan) if err != nil { logdiag.LogError(ctx, err) return b, stateDesc, root.ErrAlreadyPrinted diff --git a/wal.txt b/wal.txt new file mode 100644 index 00000000000..d365ed56d8d --- /dev/null +++ b/wal.txt @@ 
-0,0 +1,205 @@ +Design Document: Write-Ahead Log (WAL) for Bundle Deployment State Recovery +1. Problem Statement +When databricks bundle deploy is interrupted, resources created before the interruption become orphaned. The CLI only writes the state file at the end of deployment via Finalize(). Any resources created mid-deployment are lost from tracking. + +Current behavior: +Deploy starts → Create Job A → Create Job B → [CRASH] → State file empty → Jobs A, B orphaned + +Impact: Orphaned resources exist in Databricks but are unknown to future deployments. Users accumulate duplicate resources, leading to confusion and unexpected costs. + +Scope: Direct deployment engine only. Terraform has its own state management. +2. Solution Overview +Implement a Write-Ahead Log (WAL) that records each state mutation to disk immediately after the corresponding API call succeeds. +On recovery, replay the WAL to restore partial deployment state. + +Proposed behavior: +Deploy starts → Create Job A → [WAL: A] → Create Job B → [WAL: A,B] → [CRASH] +Next deploy → Load state → Replay WAL → State has A,B → No duplicates +3. Detailed Design +3.1 File Structure +The WAL is stored locally alongside the existing state file. + +File Path +Description +~/.databricks/bundle/<bundle_name>/<target>/ +Root directory for the bundle's state data. +~/.databricks/bundle/<bundle_name>/<target>/resources.json +The committed state file (existing). +~/.databricks/bundle/<bundle_name>/<target>/resources.json.wal +The Write-Ahead Log file (new). + +3.2 WAL Entry Format +Each entry is a JSON object written as a single line (NDJSON format). The entry embeds the existing ResourceEntry structure for consistency with the state file. + +Field +Type +Description +Lineage (First Entry Only) +String +UUID matching the state file's lineage (for validation). +Serial (First Entry Only) +Integer +Deployment serial number (for validation). +k (2nd Entry Onwards) +String +Resource key (e.g., resources.jobs.my_job). +v (2nd Entry Onwards) +ResourceEntry +The state entry.
Omitted for delete operations. + + +ResourceEntry structure (existing, reused): + +Field +Type +Description +__id__ +String +The unique ID assigned by the Databricks API. +state +Object +Full snapshot of the resource configuration. + + +Example WAL: +{"lineage":"abc-123","serial":1} +{"k":"resources.jobs.my_job","v":{"__id__":"1234567","state":{...}}} +{"k":"resources.jobs.old_job"} // no v means delete op +3.3 WAL Lifecycle +Phase +Action +Open +Create or open resources.json.wal. +Write +Append entry after each successful API call. +Truncate +Delete resources.json.wal after successful Finalize(). + + +Durability: Each entry must be flushed to disk (fsync) immediately after the successful API response before proceeding. +Known Limitation: There is a small window (~microseconds) between API success and WAL write where a crash would orphan the resource. This is unavoidable and is acceptable. +3.4 Recovery Mechanism +Recovery occurs at the start of deployment if the WAL file exists. + +Check: If resources.json.wal exists, initiate recovery. +Load Base State: +If resources.json exists: load it (provides lineage and serial). We are making sure it exists by writing immediately once we open/create it in the Open() method +Otherwise: create fresh state with new lineage. +Read WAL: Parse all entries from resources.json.wal (already chronologically ordered). +Validate Entries: +WAL serial == state serial + 1: Valid — replay entries. +WAL serial < state serial + 1: Stale WAL — delete WAL file, proceed without recovery. +WAL serial > state serial + 1: Corrupted state — return error. +Replay: For each valid entry: +set: Add or overwrite the resource in memory. +delete: Remove the resource from memory. +Proceed: Use the resulting state as the starting point for deployment. +Finalize: On success, write resources.json and delete resources.json.wal. +3.5 Integration Points +Action +Location +Detail +Recovery Check +Open() in dstate/state.go +Check for the WAL file and replay before proceeding.
+Write WAL Entry +SaveState() / DeleteState() +Append entry before updating memory. +Truncation +Finalize() +Delete WAL after successful state file write. + +3.6 Error Handling +Scenario +Behavior +WAL write fails +Return error, abort deployment. +Corrupted WAL line +Log warning, skip line, continue replay. +Lineage mismatch +Return error, abort deployment. +Stale serial +Delete WAL + +5. Testing Plan +Use acceptance tests. Add support for the crash caller process from the test server. +Key test cases: +Tests which compile and run real binary against testserver. + +Normal deploy — WAL created, used, deleted. +Crash after 1 resource — recovery works. +Fresh deploy with existing WAL — lineage adopted. +Stale WAL (old serial) — entries skipped. +Corrupted WAL line — skipped, rest recovered. +Bundle summary works after interrupted deploy and sees ids stored in WAL +7. Open Questions +# +Question +Proposed Answer +1 +Should WAL be pushed to remote? +Never + +5. Test Plan + +We should use acceptance tests which compile and run real binary against testserver. + +5.1 Unit Tests - WAL File Operations +| Test ID | Description | Expected Behavior | +|---------|-------------|-------------------| +| U01 | WAL path generation | walPath("resources.json") returns "resources.json.wal" | +| U02 | Write and read WAL | Header + entries written and read back correctly | +| U03 | Truncate WAL | File deleted from disk | +| U04 | Truncate non-existent WAL | No error returned | +| U05 | Read empty WAL | Returns error "WAL file is empty or missing header" | + +5.2 Unit Tests - WAL Recovery Logic +| Test ID | Description | Expected Behavior | +|---------|-------------|-------------------| +| R01 | No WAL exists | recoverFromWAL returns (false, nil) | +| R02 | Valid WAL (serial = state+1) | Entries replayed, returns (true, nil) | +| R03 | Stale WAL (serial < state+1) | WAL deleted, returns (false, nil) | +| R04 | Future WAL (serial > state+1) | Returns error about corruption | +| R05 | Lineage
mismatch | Returns error about lineage mismatch | +| R06 | Lineage adopted from WAL | If state has no lineage, WAL lineage is used | +| R07 | Delete operation replay | Entry removed from state map | +| R08 | Corrupted entry line | Skipped, other entries recovered | + +5.3 Unit Tests - Integration with DeploymentState +| Test ID | Description | Expected Behavior | +|---------|-------------|-------------------| +| I01 | SaveState/DeleteState/Finalize flow | WAL created on first SaveState, entries written, truncated on Finalize, serial incremented | +| I02 | Finalize cleans stale WAL | If WAL file exists but wasn't opened this session, delete it | +| I03 | Open with existing WAL | Recovery performed before return | +| I04 | SaveState with DependsOn | DependsOn preserved in WAL entry | + +5.4 Acceptance Tests +| Test ID | Description | Steps | Expected Behavior | +|---------|-------------|-------|-------------------| +| A01 | Normal deploy | Deploy bundle with 2 resources | WAL created during deploy, deleted after Finalize | +| A02 | Crash recovery | 1. Deploy, crash after resource A created 2. Redeploy | Resource A recovered from WAL, resource B created, no duplicates | +| A03 | Bundle summary after crash | 1. Deploy, crash mid-deploy 2. 
Run bundle summary | Shows resources from WAL with correct IDs | + +5.5 Tests Implemented in wal_test.go +- TestWALPath (U01) +- TestWALWriteAndRead (U02) +- TestWALTruncate (U03, U04) +- TestRecoverFromWAL_NoWAL (R01) +- TestRecoverFromWAL_ValidWAL (R02) +- TestRecoverFromWAL_StaleWAL (R03) +- TestRecoverFromWAL_FutureWAL (R04) +- TestRecoverFromWAL_LineageMismatch (R05) +- TestRecoverFromWAL_DeleteOperation (R07) +- TestRecoverFromWAL_CorruptedLine (R08) +- TestDeploymentState_WALIntegration (I01) +- TestDeploymentState_WALRecoveryOnOpen (I03) +- TestDeploymentState_DeleteStateWritesWAL (I01) +- TestDeploymentState_WALWithDependsOn (I04) + +5.6 Tests Still Needed +| Test ID | Description | Priority | +|---------|-------------|----------| +| R06 | TestRecoverFromWAL_LineageAdoption (fresh state adopts WAL lineage) | High | +| I02 | TestDeploymentState_FinalizeCleansStaleWAL | Medium | +| U05 | TestReadEmptyWAL | Low | +| A01-A03 | Acceptance tests (require crash simulation infrastructure) | High | From e7da9d9bc46359bb79b96701939c52c33dd76f2e Mon Sep 17 00:00:00 2001 From: Varun Deep Saini Date: Mon, 12 Jan 2026 21:32:47 +0530 Subject: [PATCH 02/80] Updated tests and enhanced kill caller with an offset Signed-off-by: Varun Deep Saini --- .../deploy/wal/chain-10-jobs/databricks.yml | 117 ++++++++++ .../deploy/wal/chain-10-jobs/out.test.toml | 5 + .../deploy/wal/chain-10-jobs/output.txt | 73 +++++++ .../bundle/deploy/wal/chain-10-jobs/script | 22 ++ .../bundle/deploy/wal/chain-10-jobs/test.py | 1 + .../bundle/deploy/wal/chain-10-jobs/test.toml | 17 ++ .../deploy/wal/corrupted-wal-entry/output.txt | 38 +--- .../deploy/wal/corrupted-wal-entry/script | 10 +- .../wal/corrupted-wal-entry/sort_warnings.py | 87 -------- .../deploy/wal/corrupted-wal-entry/test.toml | 3 +- .../wal/corrupted-wal-middle/databricks.yml | 25 +++ .../wal/corrupted-wal-middle/out.test.toml | 5 + .../wal/corrupted-wal-middle/output.txt | 25 +++ .../deploy/wal/corrupted-wal-middle/script | 37 ++++ 
.../deploy/wal/corrupted-wal-middle/test.py | 1 + .../deploy/wal/corrupted-wal-middle/test.toml | 13 ++ .../wal/crash-after-create/databricks.yml | 12 + .../deploy/wal/crash-after-create/output.txt | 42 ++-- .../deploy/wal/crash-after-create/script | 26 ++- .../deploy/wal/crash-after-create/test.toml | 13 +- .../bundle/deploy/wal/empty-wal/output.txt | 17 +- .../deploy/wal/future-serial-wal/output.txt | 16 -- .../deploy/wal/lineage-mismatch/output.txt | 16 -- .../wal/multiple-crashes/databricks.yml | 18 +- .../deploy/wal/multiple-crashes/output.txt | 57 ++--- .../bundle/deploy/wal/multiple-crashes/script | 25 +-- .../deploy/wal/multiple-crashes/test.toml | 11 +- .../deploy/wal/normal-deploy/output.txt | 16 -- .../bundle/deploy/wal/stale-wal/output.txt | 16 -- .../wal/summary-after-crash/databricks.yml | 18 +- .../deploy/wal/summary-after-crash/output.txt | 44 ++-- .../deploy/wal/summary-after-crash/script | 26 ++- .../deploy/wal/summary-after-crash/test.toml | 16 +- acceptance/bundle/deploy/wal/test.toml | 5 + .../deploy/wal/wal-with-delete/test.toml | 4 +- acceptance/internal/config.go | 6 + acceptance/internal/prepare_server.go | 17 +- .../selftest/kill_caller/offset/out.test.toml | 5 + .../selftest/kill_caller/offset/output.txt | 33 +++ acceptance/selftest/kill_caller/offset/script | 17 ++ .../selftest/kill_caller/offset/test.toml | 11 + bundle/direct/dstate/state.go | 9 +- bundle/direct/dstate/wal.go | 108 ++++----- bundle/direct/dstate/wal_test.go | 206 ++++++++++++------ wal.txt | 205 ----------------- 45 files changed, 811 insertions(+), 683 deletions(-) create mode 100644 acceptance/bundle/deploy/wal/chain-10-jobs/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/chain-10-jobs/output.txt create mode 100644 acceptance/bundle/deploy/wal/chain-10-jobs/script create mode 100644 acceptance/bundle/deploy/wal/chain-10-jobs/test.py create mode 100644 
acceptance/bundle/deploy/wal/chain-10-jobs/test.toml delete mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-entry/sort_warnings.py create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-middle/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-middle/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-middle/script create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-middle/test.py create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml create mode 100644 acceptance/selftest/kill_caller/offset/out.test.toml create mode 100644 acceptance/selftest/kill_caller/offset/output.txt create mode 100644 acceptance/selftest/kill_caller/offset/script create mode 100644 acceptance/selftest/kill_caller/offset/test.toml delete mode 100644 wal.txt diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/databricks.yml b/acceptance/bundle/deploy/wal/chain-10-jobs/databricks.yml new file mode 100644 index 00000000000..2652cdbed62 --- /dev/null +++ b/acceptance/bundle/deploy/wal/chain-10-jobs/databricks.yml @@ -0,0 +1,117 @@ +bundle: + name: wal-chain-test + +resources: + jobs: + # Linear chain: job_01 -> job_02 -> ... 
-> job_10 + # Execution order: job_01 first, job_10 last + job_01: + name: "job-01" + description: "first in chain" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_02: + name: "job-02" + description: "depends on ${resources.jobs.job_01.id}" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_03: + name: "job-03" + description: "depends on ${resources.jobs.job_02.id}" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_04: + name: "job-04" + description: "depends on ${resources.jobs.job_03.id}" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_05: + name: "job-05" + description: "depends on ${resources.jobs.job_04.id}" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_06: + name: "job-06" + description: "depends on ${resources.jobs.job_05.id}" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_07: + name: "job-07" + description: "depends on ${resources.jobs.job_06.id}" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_08: + name: "job-08" + description: "depends on ${resources.jobs.job_07.id}" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + 
node_type_id: i3.xlarge + num_workers: 0 + job_09: + name: "job-09" + description: "depends on ${resources.jobs.job_08.id}" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_10: + name: "job-10" + description: "depends on ${resources.jobs.job_09.id}" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml b/acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml new file mode 100644 index 00000000000..54146af5645 --- /dev/null +++ b/acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt new file mode 100644 index 00000000000..4c4d781c805 --- /dev/null +++ b/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt @@ -0,0 +1,73 @@ +=== First deploy (crashes on job_10) === + +>>> errcode [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files... +Deploying resources... 
+[PROCESS_KILLED] + +Exit code: [KILLED] + +=== WAL content after crash === +{"lineage":"[UUID]","serial": [SERIAL]} +{"k":"resources.jobs.job_01","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"first in chain","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-01","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]}}} +{"k":"resources.jobs.job_02","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-02","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_01","label":"${resources.jobs.job_01.id}"}]}} +{"k":"resources.jobs.job_03","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 
1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-03","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_02","label":"${resources.jobs.job_02.id}"}]}} +{"k":"resources.jobs.job_04","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-04","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_03","label":"${resources.jobs.job_03.id}"}]}} +{"k":"resources.jobs.job_05","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-05","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_04","label":"${resources.jobs.job_04.id}"}]}} +{"k":"resources.jobs.job_06","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 
1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-06","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_05","label":"${resources.jobs.job_05.id}"}]}} +{"k":"resources.jobs.job_07","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-07","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_06","label":"${resources.jobs.job_06.id}"}]}} +{"k":"resources.jobs.job_08","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-08","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_07","label":"${resources.jobs.job_07.id}"}]}} +{"k":"resources.jobs.job_09","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 
1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-09","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_08","label":"${resources.jobs.job_08.id}"}]}} + +=== Number of jobs saved in WAL === +9 + +=== Bundle summary (reads from WAL) === +Name: wal-chain-test +Target: default +Workspace: + User: [USERNAME] + Path: /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default +Resources: + Jobs: + job_01: + Name: job-01 + URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + job_02: + Name: job-02 + URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + job_03: + Name: job-03 + URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + job_04: + Name: job-04 + URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + job_05: + Name: job-05 + URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + job_06: + Name: job-06 + URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + job_07: + Name: job-07 + URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + job_08: + Name: job-08 + URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + job_09: + Name: job-09 + URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + job_10: + Name: job-10 + URL: (not deployed) + +=== Second deploy (recovery) === + +>>> [CLI] bundle deploy --force-lock +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
+ +=== WAL after successful deploy === +WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/script b/acceptance/bundle/deploy/wal/chain-10-jobs/script new file mode 100644 index 00000000000..6cf2dd32f04 --- /dev/null +++ b/acceptance/bundle/deploy/wal/chain-10-jobs/script @@ -0,0 +1,22 @@ +echo "=== First deploy (crashes on job_10) ===" +trace errcode $CLI bundle deploy + +echo "" +echo "=== WAL content after crash ===" +cat .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "No WAL file" + +echo "" +echo "=== Number of jobs saved in WAL ===" +grep -c '"k":"resources.jobs' .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "0" + +echo "" +echo "=== Bundle summary (reads from WAL) ===" +$CLI bundle summary + +echo "" +echo "=== Second deploy (recovery) ===" +trace $CLI bundle deploy --force-lock + +echo "" +echo "=== WAL after successful deploy ===" +cat .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "WAL deleted (expected)" diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/test.py b/acceptance/bundle/deploy/wal/chain-10-jobs/test.py new file mode 100644 index 00000000000..1ff8e07c707 --- /dev/null +++ b/acceptance/bundle/deploy/wal/chain-10-jobs/test.py @@ -0,0 +1 @@ +print("test") diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/test.toml b/acceptance/bundle/deploy/wal/chain-10-jobs/test.toml new file mode 100644 index 00000000000..c4308521be1 --- /dev/null +++ b/acceptance/bundle/deploy/wal/chain-10-jobs/test.toml @@ -0,0 +1,17 @@ +# Linear chain: job_01 -> job_02 -> ... 
-> job_10 +# Let first 9 jobs/create succeed, then kill on the 10th + +[[Server]] +Pattern = "POST /api/2.2/jobs/create" +KillCallerOffset = 9 +KillCaller = 1 +Response.Body = '{"job_id": 1001}' + +[[Server]] +Pattern = "POST /api/2.2/jobs/reset" +Response.Body = '{}' + +[[Server]] +Pattern = "GET /api/2.2/jobs/get" +Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' + diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt index 11926293327..f5e7f346d86 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt @@ -1,45 +1,13 @@ === Creating state file with serial 5 === -=== Creating WAL with corrupted entry === +=== Creating WAL with corrupted LAST entry === === WAL content === {"lineage":"test-lineage-123","serial": [SERIAL]} {"k":"resources.jobs.valid_job","v":{"__id__": "[ID]","state":{"name":"valid-job"}}} -not valid json - this line should be skipped {"k":"resources.jobs.another_valid","v":{"__id__": "[ID]","state":{"name":"another-valid"}}} -=== Deploy (should recover valid entries, skip corrupted) === +not valid json - corrupted last line (partial write from crash) +=== Deploy (should recover valid entries, skip corrupted last line) === >>> [CLI] bundle deploy -Warning: Single node cluster is not correctly configured - at resources.jobs.another_valid.tasks[0].new_cluster - in databricks.yml:23:13 - -num_workers should be 0 only for single-node clusters. 
To create a -valid single node cluster please ensure that the following properties -are correctly set in the cluster specification: - - spark_conf: - spark.databricks.cluster.profile: singleNode - spark.master: local[*] - - custom_tags: - ResourceClass: SingleNode - - -Warning: Single node cluster is not correctly configured - at resources.jobs.valid_job.tasks[0].new_cluster - in databricks.yml:13:13 - -num_workers should be 0 only for single-node clusters. To create a -valid single node cluster please ensure that the following properties -are correctly set in the cluster specification: - - spark_conf: - spark.databricks.cluster.profile: singleNode - spark.master: local[*] - - custom_tags: - ResourceClass: SingleNode - - Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-test/default/files... Deploying resources... Updating deployment state... diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script index d73595a6f4c..fc36ed754fe 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script @@ -10,19 +10,21 @@ cat > .databricks/bundle/default/resources.json << 'EOF' } EOF -echo "=== Creating WAL with corrupted entry ===" +echo "=== Creating WAL with corrupted LAST entry ===" +# Corrupted last line is expected (partial write from crash) and should be skipped. +# Valid entries before it should be recovered. 
cat > .databricks/bundle/default/resources.json.wal << 'EOF' {"lineage":"test-lineage-123","serial":6} {"k":"resources.jobs.valid_job","v":{"__id__":"1111","state":{"name":"valid-job"}}} -not valid json - this line should be skipped {"k":"resources.jobs.another_valid","v":{"__id__":"2222","state":{"name":"another-valid"}}} +not valid json - corrupted last line (partial write from crash) EOF echo "=== WAL content ===" cat .databricks/bundle/default/resources.json.wal -echo "=== Deploy (should recover valid entries, skip corrupted) ===" -trace $CLI bundle deploy 2>&1 | python3 sort_warnings.py +echo "=== Deploy (should recover valid entries, skip corrupted last line) ===" +trace $CLI bundle deploy 2>&1 echo "=== Final state (should have recovered entries) ===" cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys | sort)}' diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/sort_warnings.py b/acceptance/bundle/deploy/wal/corrupted-wal-entry/sort_warnings.py deleted file mode 100644 index 06a6a0e59cc..00000000000 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/sort_warnings.py +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env python3 -"""Sort warning blocks in CLI output to make test output deterministic. - -Warning blocks look like: -Warning: Single node cluster is not correctly configured - at resources.jobs.XXX.tasks[0].new_cluster - in databricks.yml:NN:NN - -num_workers should be 0 only for single-node clusters... - spark_conf: - ... - custom_tags: - ... - -This script groups consecutive warning blocks, sorts them by job name, and outputs. 
-""" - -import re -import sys - - -def main(): - content = sys.stdin.read() - lines = content.split("\n") - - result = [] - i = 0 - - while i < len(lines): - line = lines[i] - - # Check if this is the start of a warning block - if line.startswith("Warning:"): - # Collect all consecutive warning blocks - warnings = [] - while i < len(lines) and ( - lines[i].startswith("Warning:") - or ( - warnings - and not lines[i].startswith("Uploading") - and not lines[i].startswith("Deploying") - and not lines[i].startswith(">>>") - and not lines[i].startswith("===") - ) - ): - # Collect one complete warning block - block = [] - if lines[i].startswith("Warning:"): - block.append(lines[i]) - i += 1 - # Collect until next Warning or end marker - while i < len(lines): - if lines[i].startswith("Warning:"): - break - if lines[i].startswith("Uploading") or lines[i].startswith("Deploying"): - break - if lines[i].startswith(">>>") or lines[i].startswith("==="): - break - block.append(lines[i]) - i += 1 - warnings.append(block) - else: - i += 1 - - # Sort warnings by the job name in "at resources.jobs.XXX" - def get_sort_key(block): - for line in block: - match = re.search(r"at resources\.jobs\.(\w+)", line) - if match: - return match.group(1) - return "" - - warnings.sort(key=get_sort_key) - - # Output sorted warnings - for block in warnings: - for line in block: - result.append(line) - else: - result.append(line) - i += 1 - - print("\n".join(result), end="") - - -if __name__ == "__main__": - main() diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml index 5bbe82835c6..9c9ab5a30bd 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml @@ -1,4 +1,4 @@ -# WAL with corrupted entry - valid entries should be recovered, corrupted skipped. 
+# WAL with corrupted LAST entry - valid entries should be recovered, corrupted last line skipped. [[Server]] Pattern = "POST /api/2.2/jobs/reset" @@ -11,3 +11,4 @@ Response.Body = '{"job_id": 1111, "settings": {"name": "valid-job"}}' [[Server]] Pattern = "GET /api/2.2/jobs/get?job_id=2222" Response.Body = '{"job_id": 2222, "settings": {"name": "another-valid"}}' + diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/databricks.yml b/acceptance/bundle/deploy/wal/corrupted-wal-middle/databricks.yml new file mode 100644 index 00000000000..aef2c714ec7 --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/databricks.yml @@ -0,0 +1,25 @@ +bundle: + name: wal-corrupted-middle-test + +resources: + jobs: + job_one: + name: "job-one" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_two: + name: "job-two" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/out.test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-middle/out.test.toml new file mode 100644 index 00000000000..54146af5645 --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt new file mode 100644 index 00000000000..4396aade670 --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt @@ -0,0 +1,25 @@ +=== Creating state file with serial 5 === +=== Creating WAL with corrupted MIDDLE entry === +=== WAL content === +{"lineage":"test-lineage-456","serial": [SERIAL]} 
+{"k":"resources.jobs.job_one","v":{"__id__": "[ID]","state":{"name":"job-one"}}} +not valid json - CORRUPTED MIDDLE LINE +{"k":"resources.jobs.job_two","v":{"__id__": "[ID]","state":{"name":"job-two"}}} +=== Deploy (WAL should be deleted due to middle corruption) === + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-middle-test/default/files... +Warn: Failed to read WAL file, deleting and proceeding: WAL line 3: corrupted entry in middle of WAL: invalid character 'o' in literal null (expecting 'u') +Deploying resources... +Updating deployment state... +Deployment complete! +=== Final state (fresh deploy, not recovered from WAL) === +{ + "serial": [SERIAL], + "state_keys": [ + "resources.jobs.job_one", + "resources.jobs.job_two" + ] +} +=== WAL after deploy === +WAL deleted (expected - due to middle corruption) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/script b/acceptance/bundle/deploy/wal/corrupted-wal-middle/script new file mode 100644 index 00000000000..46dc1922d16 --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/script @@ -0,0 +1,37 @@ +echo "=== Creating state file with serial 5 ===" +mkdir -p .databricks/bundle/default +cat > .databricks/bundle/default/resources.json << 'EOF' +{ + "state_version": 1, + "cli_version": "0.0.0", + "lineage": "test-lineage-456", + "serial": 5, + "state": {} +} +EOF + +echo "=== Creating WAL with corrupted MIDDLE entry ===" +# Corruption in the middle is NOT expected (only last line can be partial write). +# This should cause WAL to be deleted entirely, no recovery. 
+cat > .databricks/bundle/default/resources.json.wal << 'EOF' +{"lineage":"test-lineage-456","serial":6} +{"k":"resources.jobs.job_one","v":{"__id__":"1111","state":{"name":"job-one"}}} +not valid json - CORRUPTED MIDDLE LINE +{"k":"resources.jobs.job_two","v":{"__id__":"2222","state":{"name":"job-two"}}} +EOF + +echo "=== WAL content ===" +cat .databricks/bundle/default/resources.json.wal + +echo "=== Deploy (WAL should be deleted due to middle corruption) ===" +trace $CLI bundle deploy 2>&1 + +echo "=== Final state (fresh deploy, not recovered from WAL) ===" +cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys | sort)}' + +echo "=== WAL after deploy ===" +if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + echo "WAL exists (unexpected)" +else + echo "WAL deleted (expected - due to middle corruption)" +fi diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.py b/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.py new file mode 100644 index 00000000000..1ff8e07c707 --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.py @@ -0,0 +1 @@ +print("test") diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml new file mode 100644 index 00000000000..8aa40be8d70 --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml @@ -0,0 +1,13 @@ +# WAL with corrupted MIDDLE entry - WAL should be deleted, no recovery. +# Corruption in the middle is unexpected (not a partial write from crash). +# The entire WAL is discarded and a fresh deploy happens. 
+ +# Since WAL is discarded, jobs will be created fresh (not recovered) +[[Server]] +Pattern = "POST /api/2.2/jobs/create" +Response.Body = '{"job_id": 9999}' + +[[Server]] +Pattern = "GET /api/2.2/jobs/get?job_id=9999" +Response.Body = '{"job_id": 9999, "settings": {"name": "fresh-job"}}' + diff --git a/acceptance/bundle/deploy/wal/crash-after-create/databricks.yml b/acceptance/bundle/deploy/wal/crash-after-create/databricks.yml index ebee1d9699f..31480454c55 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/databricks.yml +++ b/acceptance/bundle/deploy/wal/crash-after-create/databricks.yml @@ -5,6 +5,7 @@ resources: jobs: job_a: name: "test-job-a" + description: "first job" tasks: - task_key: "task-a" spark_python_task: @@ -13,3 +14,14 @@ resources: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge num_workers: 0 + job_b: + name: "test-job-b" + description: "depends on ${resources.jobs.job_a.id}" + tasks: + - task_key: "task-b" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt index 9c333263829..9ab9f4cf9ce 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/output.txt +++ b/acceptance/bundle/deploy/wal/crash-after-create/output.txt @@ -1,37 +1,33 @@ -=== Creating state directory === -=== Creating WAL file (simulating crash after job create) === -=== WAL content before deploy === -{"lineage":"test-lineage-123","serial": [SERIAL]} -{"k":"resources.jobs.job_a","v":{"__id__": "[ID]","state":{"name":"test-job-a"}}} -=== Deploy (should recover from WAL) === +=== First deploy (crashes after job_a create, before job_b) === ->>> [CLI] bundle deploy -Warning: Single node cluster is not correctly configured - at resources.jobs.job_a.tasks[0].new_cluster - in databricks.yml:13:13 - -num_workers should be 0 only for single-node 
clusters. To create a -valid single node cluster please ensure that the following properties -are correctly set in the cluster specification: - - spark_conf: - spark.databricks.cluster.profile: singleNode - spark.master: local[*] +>>> errcode [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files... +Deploying resources... +[PROCESS_KILLED] - custom_tags: - ResourceClass: SingleNode - +Exit code: [KILLED] +=== WAL should exist after crash === +WAL exists (expected) +{"lineage":"[UUID]","serial": [SERIAL]} +{"k":"resources.jobs.job_a","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} +=== State file after crash (should be empty) === +{ + "serial": [SERIAL], + "state_keys": [] +} +=== Second deploy (should recover from WAL and complete) === +>>> [CLI] bundle deploy --force-lock Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files... Deploying resources... Updating deployment state... Deployment complete! 
=== State file after recovery === { - "lineage": "test-lineage-123", "serial": [SERIAL], "state_keys": [ - "resources.jobs.job_a" + "resources.jobs.job_a", + "resources.jobs.job_b" ] } === WAL file after successful deploy === diff --git a/acceptance/bundle/deploy/wal/crash-after-create/script b/acceptance/bundle/deploy/wal/crash-after-create/script index c583a5eead9..d09f6ab06eb 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/script +++ b/acceptance/bundle/deploy/wal/crash-after-create/script @@ -1,20 +1,22 @@ -echo "=== Creating state directory ===" -mkdir -p .databricks/bundle/default +echo "=== First deploy (crashes after job_a create, before job_b) ===" +trace errcode $CLI bundle deploy -echo "=== Creating WAL file (simulating crash after job create) ===" -cat > .databricks/bundle/default/resources.json.wal << 'EOF' -{"lineage":"test-lineage-123","serial":1} -{"k":"resources.jobs.job_a","v":{"__id__":"1001","state":{"name":"test-job-a"}}} -EOF +echo "=== WAL should exist after crash ===" +if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + echo "WAL exists (expected)" + cat .databricks/bundle/default/resources.json.wal +else + echo "WAL missing (unexpected)" +fi -echo "=== WAL content before deploy ===" -cat .databricks/bundle/default/resources.json.wal +echo "=== State file after crash (should be empty) ===" +cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys)}' -echo "=== Deploy (should recover from WAL) ===" -trace $CLI bundle deploy +echo "=== Second deploy (should recover from WAL and complete) ===" +trace $CLI bundle deploy --force-lock echo "=== State file after recovery ===" -cat .databricks/bundle/default/resources.json | jq -S '{lineage: .lineage, serial: .serial, state_keys: (.state | keys)}' +cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys)}' echo "=== WAL file after successful deploy ===" if [ -f 
".databricks/bundle/default/resources.json.wal" ]; then diff --git a/acceptance/bundle/deploy/wal/crash-after-create/test.toml b/acceptance/bundle/deploy/wal/crash-after-create/test.toml index 9e20bac15dc..5023224e577 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/test.toml +++ b/acceptance/bundle/deploy/wal/crash-after-create/test.toml @@ -1,5 +1,10 @@ -# WAL recovery after simulated crash. Job was created but state wasn't finalized. -# Deploy should recover job from WAL and update it. +# WAL recovery after real crash. First deploy creates job_a then crashes. +# Second deploy recovers from WAL and completes successfully. +# job_b depends on job_a, so jobs/get is called after job_a's SaveState. + +[[Server]] +Pattern = "POST /api/2.2/jobs/create" +Response.Body = '{"job_id": 1001}' [[Server]] Pattern = "POST /api/2.2/jobs/reset" @@ -7,4 +12,6 @@ Response.Body = '{}' [[Server]] Pattern = "GET /api/2.2/jobs/get" -Response.Body = '{"job_id": 1001, "settings": {"name": "test-job-a"}}' +KillCaller = 1 +Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' + diff --git a/acceptance/bundle/deploy/wal/empty-wal/output.txt b/acceptance/bundle/deploy/wal/empty-wal/output.txt index 91a31fe3222..21b68510803 100644 --- a/acceptance/bundle/deploy/wal/empty-wal/output.txt +++ b/acceptance/bundle/deploy/wal/empty-wal/output.txt @@ -5,23 +5,8 @@ === Deploy (should handle empty WAL gracefully) === >>> [CLI] bundle deploy -Warning: Single node cluster is not correctly configured - at resources.jobs.test_job.tasks[0].new_cluster - in databricks.yml:13:13 - -num_workers should be 0 only for single-node clusters. 
To create a -valid single node cluster please ensure that the following properties -are correctly set in the cluster specification: - - spark_conf: - spark.databricks.cluster.profile: singleNode - spark.master: local[*] - - custom_tags: - ResourceClass: SingleNode - - Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-empty-test/default/files... +Warn: Failed to read WAL file, deleting and proceeding: WAL file is empty Deploying resources... Updating deployment state... Deployment complete! diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt index ffb03147dc7..b0e5bda5585 100644 --- a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt +++ b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt @@ -6,22 +6,6 @@ === Deploy (should fail with corruption error) === >>> errcode [CLI] bundle deploy -Warning: Single node cluster is not correctly configured - at resources.jobs.test_job.tasks[0].new_cluster - in databricks.yml:13:13 - -num_workers should be 0 only for single-node clusters. To create a -valid single node cluster please ensure that the following properties -are correctly set in the cluster specification: - - spark_conf: - spark.databricks.cluster.profile: singleNode - spark.master: local[*] - - custom_tags: - ResourceClass: SingleNode - - Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-future-serial-test/default/files... 
Error: reading state from [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: WAL recovery failed: WAL serial (5) is ahead of expected (3), state may be corrupted diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt index 2419e7a6129..7f6c3a89bd3 100644 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt @@ -6,22 +6,6 @@ === Deploy (should fail with lineage mismatch error) === >>> errcode [CLI] bundle deploy -Warning: Single node cluster is not correctly configured - at resources.jobs.test_job.tasks[0].new_cluster - in databricks.yml:13:13 - -num_workers should be 0 only for single-node clusters. To create a -valid single node cluster please ensure that the following properties -are correctly set in the cluster specification: - - spark_conf: - spark.databricks.cluster.profile: singleNode - spark.master: local[*] - - custom_tags: - ResourceClass: SingleNode - - Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-lineage-mismatch-test/default/files... 
Error: reading state from [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: WAL recovery failed: WAL lineage (wal-lineage-bbb) does not match state lineage (state-lineage-aaa) diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/databricks.yml b/acceptance/bundle/deploy/wal/multiple-crashes/databricks.yml index b4162d8fdf3..3dc96ed8560 100644 --- a/acceptance/bundle/deploy/wal/multiple-crashes/databricks.yml +++ b/acceptance/bundle/deploy/wal/multiple-crashes/databricks.yml @@ -3,10 +3,22 @@ bundle: resources: jobs: - test_job: - name: "test-job" + job_a: + name: "test-job-a" + description: "first job" tasks: - - task_key: "test-task" + - task_key: "task-a" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_b: + name: "test-job-b" + description: "depends on ${resources.jobs.job_a.id}" + tasks: + - task_key: "task-b" spark_python_task: python_file: ./test.py new_cluster: diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/output.txt b/acceptance/bundle/deploy/wal/multiple-crashes/output.txt index 3e0426a628c..33dd984b742 100644 --- a/acceptance/bundle/deploy/wal/multiple-crashes/output.txt +++ b/acceptance/bundle/deploy/wal/multiple-crashes/output.txt @@ -1,54 +1,28 @@ -=== Creating state directory === -=== Creating WAL file (simulating crash after job create) === -=== WAL content === -{"lineage":"test-lineage-456","serial": [SERIAL]} -{"k":"resources.jobs.test_job","v":{"__id__": "[ID]","state":{"name":"test-job"}}} -=== First deploy attempt (will crash during update) === +=== First deploy (crashes after job_a create) === >>> errcode [CLI] bundle deploy -Warning: Single node cluster is not correctly configured - at resources.jobs.test_job.tasks[0].new_cluster - in databricks.yml:13:13 - -num_workers should be 0 only for single-node clusters. 
To create a -valid single node cluster please ensure that the following properties -are correctly set in the cluster specification: - - spark_conf: - spark.databricks.cluster.profile: singleNode - spark.master: local[*] - - custom_tags: - ResourceClass: SingleNode - - Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files... Deploying resources... [PROCESS_KILLED] Exit code: [KILLED] === WAL after first crash === -{"lineage":"test-lineage-456","serial": [SERIAL]} -{"k":"resources.jobs.test_job","v":{"__id__": "[ID]","state":{"name":"test-job"}}} -=== Second deploy attempt (should succeed) === - ->>> [CLI] bundle deploy --force-lock -Warning: Single node cluster is not correctly configured - at resources.jobs.test_job.tasks[0].new_cluster - in databricks.yml:13:13 +WAL exists +{"lineage":"[UUID]","serial": [SERIAL]} +{"k":"resources.jobs.job_a","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} +=== Second deploy (crashes during job_a update) === -num_workers should be 0 only for single-node clusters. To create a -valid single node cluster please ensure that the following properties -are correctly set in the cluster specification: - - spark_conf: - spark.databricks.cluster.profile: singleNode - spark.master: local[*] +>>> errcode [CLI] bundle deploy --force-lock +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files... +Deploying resources... 
+[PROCESS_KILLED] - custom_tags: - ResourceClass: SingleNode - +Exit code: [KILLED] +=== WAL after second crash === +WAL still exists +=== Third deploy (should succeed) === +>>> [CLI] bundle deploy --force-lock Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files... Deploying resources... Updating deployment state... @@ -57,7 +31,8 @@ Deployment complete! { "serial": [SERIAL], "state_keys": [ - "resources.jobs.test_job" + "resources.jobs.job_a", + "resources.jobs.job_b" ] } === WAL after successful deploy === diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/script b/acceptance/bundle/deploy/wal/multiple-crashes/script index 795e4261e19..0adcd2a980e 100644 --- a/acceptance/bundle/deploy/wal/multiple-crashes/script +++ b/acceptance/bundle/deploy/wal/multiple-crashes/script @@ -1,24 +1,21 @@ -echo "=== Creating state directory ===" -mkdir -p .databricks/bundle/default - -echo "=== Creating WAL file (simulating crash after job create) ===" -cat > .databricks/bundle/default/resources.json.wal << 'EOF' -{"lineage":"test-lineage-456","serial":1} -{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} -EOF - -echo "=== WAL content ===" -cat .databricks/bundle/default/resources.json.wal - -echo "=== First deploy attempt (will crash during update) ===" +echo "=== First deploy (crashes after job_a create) ===" trace errcode $CLI bundle deploy echo "=== WAL after first crash ===" if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + echo "WAL exists" cat .databricks/bundle/default/resources.json.wal fi -echo "=== Second deploy attempt (should succeed) ===" +echo "=== Second deploy (crashes during job_a update) ===" +trace errcode $CLI bundle deploy --force-lock + +echo "=== WAL after second crash ===" +if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + echo "WAL still exists" +fi + +echo "=== Third deploy (should succeed) ===" trace $CLI bundle deploy --force-lock echo 
"=== Final state ===" diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/test.toml b/acceptance/bundle/deploy/wal/multiple-crashes/test.toml index 2e9973c8464..c5981d67208 100644 --- a/acceptance/bundle/deploy/wal/multiple-crashes/test.toml +++ b/acceptance/bundle/deploy/wal/multiple-crashes/test.toml @@ -1,4 +1,11 @@ -# Multiple crashes during recovery - WAL should persist until successful finalize. +# Multiple real crashes during deployment - WAL should persist until successful finalize. +# First deploy: crashes after job_a create (kill on jobs/get) +# Second deploy: crashes during job_a update (kill on jobs/reset) +# Third deploy: succeeds (both counters exhausted) + +[[Server]] +Pattern = "POST /api/2.2/jobs/create" +Response.Body = '{"job_id": 1001}' [[Server]] Pattern = "POST /api/2.2/jobs/reset" @@ -7,4 +14,6 @@ Response.Body = '{}' [[Server]] Pattern = "GET /api/2.2/jobs/get" +KillCaller = 1 Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' + diff --git a/acceptance/bundle/deploy/wal/normal-deploy/output.txt b/acceptance/bundle/deploy/wal/normal-deploy/output.txt index 50c1430641f..ccb189ff09b 100644 --- a/acceptance/bundle/deploy/wal/normal-deploy/output.txt +++ b/acceptance/bundle/deploy/wal/normal-deploy/output.txt @@ -1,21 +1,5 @@ >>> [CLI] bundle deploy -Warning: Single node cluster is not correctly configured - at resources.jobs.test_job.tasks[0].new_cluster - in databricks.yml:13:13 - -num_workers should be 0 only for single-node clusters. To create a -valid single node cluster please ensure that the following properties -are correctly set in the cluster specification: - - spark_conf: - spark.databricks.cluster.profile: singleNode - spark.master: local[*] - - custom_tags: - ResourceClass: SingleNode - - Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-test/default/files... Deploying resources... Updating deployment state... 
diff --git a/acceptance/bundle/deploy/wal/stale-wal/output.txt b/acceptance/bundle/deploy/wal/stale-wal/output.txt index 3722788e52d..682534de7ce 100644 --- a/acceptance/bundle/deploy/wal/stale-wal/output.txt +++ b/acceptance/bundle/deploy/wal/stale-wal/output.txt @@ -7,22 +7,6 @@ === Deploy (should ignore stale WAL) === >>> [CLI] bundle deploy -Warning: Single node cluster is not correctly configured - at resources.jobs.test_job.tasks[0].new_cluster - in databricks.yml:13:13 - -num_workers should be 0 only for single-node clusters. To create a -valid single node cluster please ensure that the following properties -are correctly set in the cluster specification: - - spark_conf: - spark.databricks.cluster.profile: singleNode - spark.master: local[*] - - custom_tags: - ResourceClass: SingleNode - - Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-stale-test/default/files... Deploying resources... Updating deployment state... diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/databricks.yml b/acceptance/bundle/deploy/wal/summary-after-crash/databricks.yml index 063faa8e546..86376fd7baf 100644 --- a/acceptance/bundle/deploy/wal/summary-after-crash/databricks.yml +++ b/acceptance/bundle/deploy/wal/summary-after-crash/databricks.yml @@ -3,10 +3,22 @@ bundle: resources: jobs: - test_job: - name: "test-job" + job_a: + name: "job-a" + description: "first job" tasks: - - task_key: "test-task" + - task_key: "task-a" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_b: + name: "job-b" + description: "depends on ${resources.jobs.job_a.id}" + tasks: + - task_key: "task-b" spark_python_task: python_file: ./test.py new_cluster: diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/output.txt b/acceptance/bundle/deploy/wal/summary-after-crash/output.txt index 2e6abf645ae..9a2644a60b8 100644 --- 
a/acceptance/bundle/deploy/wal/summary-after-crash/output.txt +++ b/acceptance/bundle/deploy/wal/summary-after-crash/output.txt @@ -1,25 +1,29 @@ -=== Creating state directory === -=== Creating WAL file (simulating crash after job create) === -=== Bundle summary (should show job from WAL with id) === +=== Deploy (job_a created and saved, then crash on jobs/get) === ->>> [CLI] bundle summary -o json -Warning: Single node cluster is not correctly configured - at resources.jobs.test_job.tasks[0].new_cluster - in databricks.yml:13:13 - -num_workers should be 0 only for single-node clusters. To create a -valid single node cluster please ensure that the following properties -are correctly set in the cluster specification: - - spark_conf: - spark.databricks.cluster.profile: singleNode - spark.master: local[*] +>>> errcode [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-summary-test/default/files... +Deploying resources... +[PROCESS_KILLED] - custom_tags: - ResourceClass: SingleNode - +Exit code: [KILLED] +=== State directory contents after crash === +deployment.json +resources.json +resources.json.wal +sync-snapshots +=== WAL should exist after crash === +WAL exists (expected) +{"lineage":"[UUID]","serial": [SERIAL]} +{"k":"resources.jobs.job_a","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-summary-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-summary-test/default/files/test.py"},"task_key":"task-a"}]}}} +=== State file after crash === +{ + "serial": [SERIAL], + "state_keys": [] +} +=== Bundle summary (should show job_a from WAL) === +>>> [CLI] bundle 
summary -o json { - "job_id": "[ID]", - "modified_status": null + "job_a_id": "1001", + "job_b_id": null } diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/script b/acceptance/bundle/deploy/wal/summary-after-crash/script index d2017c65907..3b007062c60 100644 --- a/acceptance/bundle/deploy/wal/summary-after-crash/script +++ b/acceptance/bundle/deploy/wal/summary-after-crash/script @@ -1,11 +1,19 @@ -echo "=== Creating state directory ===" -mkdir -p .databricks/bundle/default +echo "=== Deploy (job_a created and saved, then crash on jobs/get) ===" +trace errcode $CLI bundle deploy -echo "=== Creating WAL file (simulating crash after job create) ===" -cat > .databricks/bundle/default/resources.json.wal << 'EOF' -{"lineage":"summary-test-lineage","serial":1} -{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} -EOF +echo "=== State directory contents after crash ===" +ls .databricks/bundle/default/ -echo "=== Bundle summary (should show job from WAL with id) ===" -trace $CLI bundle summary -o json | jq '{job_id: .resources.jobs.test_job.id, modified_status: .resources.jobs.test_job.modified_status}' +echo "=== WAL should exist after crash ===" +if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + echo "WAL exists (expected)" + cat .databricks/bundle/default/resources.json.wal +else + echo "WAL missing (unexpected)" +fi + +echo "=== State file after crash ===" +cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys)}' + +echo "=== Bundle summary (should show job_a from WAL) ===" +trace $CLI bundle summary -o json | jq '{job_a_id: .resources.jobs.job_a.id, job_b_id: .resources.jobs.job_b.id}' diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/test.toml b/acceptance/bundle/deploy/wal/summary-after-crash/test.toml index 3363a1c516e..961030e9816 100644 --- a/acceptance/bundle/deploy/wal/summary-after-crash/test.toml +++ 
b/acceptance/bundle/deploy/wal/summary-after-crash/test.toml @@ -1,2 +1,14 @@ -# Bundle summary should show resources recovered from WAL. -# No server stubs needed - we just run bundle summary which reads state. +# Bundle summary should show resources recovered from WAL after a real crash. +# job_b depends on job_a, so after job_a is created and SaveState is called, +# refreshRemoteState calls jobs/get to fetch job_a's state for job_b's reference. +# We kill on jobs/get - AFTER job_a's SaveState, so WAL contains job_a. + +[[Server]] +Pattern = "POST /api/2.2/jobs/create" +Response.Body = '{"job_id": 1001}' + +[[Server]] +Pattern = "GET /api/2.2/jobs/get" +KillCaller = 1 +Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' + diff --git a/acceptance/bundle/deploy/wal/test.toml b/acceptance/bundle/deploy/wal/test.toml index 7fd1daf93bd..1632ddb1957 100644 --- a/acceptance/bundle/deploy/wal/test.toml +++ b/acceptance/bundle/deploy/wal/test.toml @@ -41,3 +41,8 @@ New = '"__id__": "[ID]"' [[Repls]] Old = '"job_id":\s*"\d+"' New = '"job_id": "[ID]"' + +# Strip single-node cluster warnings (they appear in varying order and aren't relevant to WAL tests) +[[Repls]] +Old = '(?s)Warning: Single node cluster.*?ResourceClass: SingleNode\n \n\n' +New = '' diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/test.toml b/acceptance/bundle/deploy/wal/wal-with-delete/test.toml index 27045f8885e..4f81ae46952 100644 --- a/acceptance/bundle/deploy/wal/wal-with-delete/test.toml +++ b/acceptance/bundle/deploy/wal/wal-with-delete/test.toml @@ -1,5 +1,7 @@ -# WAL recovery after crash during delete operation. +# WAL recovery after crash during delete operation (simulated). # Delete was recorded in WAL but not finalized. Deploy should complete the delete. +# Note: Real crash testing for delete is not possible because there's no API call +# after DeleteState (unlike create which has refreshRemoteState after SaveState). 
# No server stubs needed - the delete was already done (recorded in WAL) # and the job no longer needs API calls diff --git a/acceptance/internal/config.go b/acceptance/internal/config.go index 06ac61c39b8..dc63911173c 100644 --- a/acceptance/internal/config.go +++ b/acceptance/internal/config.go @@ -159,6 +159,12 @@ type ServerStub struct { // Useful for testing crash recovery scenarios where first deploy crashes but retry succeeds. // Requires DATABRICKS_CLI_TEST_PID=1 to be set in the test environment. KillCaller int + + // Number of requests to let pass before starting to kill. + // Combined with KillCaller, this creates a window: requests 1 to Offset succeed, + // requests Offset+1 to Offset+KillCaller are killed, rest succeed. + // Example: KillCallerOffset=9, KillCaller=1 means let 9 requests pass, kill the 10th. + KillCallerOffset int } // FindConfigs finds all the config relevant for this test, diff --git a/acceptance/internal/prepare_server.go b/acceptance/internal/prepare_server.go index 8f18d1c61bc..dfa89ef7486 100644 --- a/acceptance/internal/prepare_server.go +++ b/acceptance/internal/prepare_server.go @@ -183,8 +183,9 @@ func startLocalServer(t *testing.T, s.ResponseCallback = logResponseCallback(t) } - // Track remaining kill counts per pattern (for KillCaller > 0) + // Track remaining kill counts and offset counts per pattern (for KillCaller > 0) killCounters := make(map[string]int) + offsetCounters := make(map[string]int) killCountersMu := &sync.Mutex{} for ind := range stubs { @@ -195,9 +196,10 @@ func startLocalServer(t *testing.T, items := strings.Split(stub.Pattern, " ") require.Len(t, items, 2) - // Initialize kill counter for this pattern + // Initialize kill counter and offset counter for this pattern if stub.KillCaller > 0 { killCounters[stub.Pattern] = stub.KillCaller + offsetCounters[stub.Pattern] = stub.KillCallerOffset } s.Handle(items[0], items[1], func(req testserver.Request) any { @@ -218,7 +220,7 @@ func startLocalServer(t 
*testing.T, } } - if shouldKillCaller(stub, killCounters, killCountersMu) { + if shouldKillCaller(stub, offsetCounters, killCounters, killCountersMu) { killCaller(t, stub.Pattern, req.Headers) } @@ -232,12 +234,19 @@ func startLocalServer(t *testing.T, return s.URL } -func shouldKillCaller(stub ServerStub, killCounters map[string]int, mu *sync.Mutex) bool { +func shouldKillCaller(stub ServerStub, offsetCounters, killCounters map[string]int, mu *sync.Mutex) bool { if stub.KillCaller <= 0 { return false } mu.Lock() defer mu.Unlock() + + // Still in offset period? Let this request pass. + if offsetCounters[stub.Pattern] > 0 { + offsetCounters[stub.Pattern]-- + return false + } + if killCounters[stub.Pattern] <= 0 { return false } diff --git a/acceptance/selftest/kill_caller/offset/out.test.toml b/acceptance/selftest/kill_caller/offset/out.test.toml new file mode 100644 index 00000000000..d560f1de043 --- /dev/null +++ b/acceptance/selftest/kill_caller/offset/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["terraform", "direct"] diff --git a/acceptance/selftest/kill_caller/offset/output.txt b/acceptance/selftest/kill_caller/offset/output.txt new file mode 100644 index 00000000000..03407dd0d8a --- /dev/null +++ b/acceptance/selftest/kill_caller/offset/output.txt @@ -0,0 +1,33 @@ + +>>> [CLI] current-user me +{ + "id":"123", + "userName":"test@example.com" +} +Attempt 1 done - success (offset) + +>>> [CLI] current-user me +{ + "id":"123", + "userName":"test@example.com" +} +Attempt 2 done - success (offset) + +>>> errcode [CLI] current-user me +[PROCESS_KILLED] + +Exit code: [KILLED] +Attempt 3 done - killed + +>>> errcode [CLI] current-user me +[PROCESS_KILLED] + +Exit code: [KILLED] +Attempt 4 done - killed + +>>> [CLI] current-user me +{ + "id":"123", + "userName":"test@example.com" +} +Attempt 5 done - success (past kill window) diff --git a/acceptance/selftest/kill_caller/offset/script 
b/acceptance/selftest/kill_caller/offset/script new file mode 100644 index 00000000000..3411e874806 --- /dev/null +++ b/acceptance/selftest/kill_caller/offset/script @@ -0,0 +1,17 @@ +# First 2 attempts should succeed (offset period) +trace $CLI current-user me +echo "Attempt 1 done - success (offset)" + +trace $CLI current-user me +echo "Attempt 2 done - success (offset)" + +# Attempts 3-4 should be killed +trace errcode $CLI current-user me +echo "Attempt 3 done - killed" + +trace errcode $CLI current-user me +echo "Attempt 4 done - killed" + +# Attempt 5 should succeed again +trace $CLI current-user me +echo "Attempt 5 done - success (past kill window)" diff --git a/acceptance/selftest/kill_caller/offset/test.toml b/acceptance/selftest/kill_caller/offset/test.toml new file mode 100644 index 00000000000..5eab09dbfaa --- /dev/null +++ b/acceptance/selftest/kill_caller/offset/test.toml @@ -0,0 +1,11 @@ +# Let first 2 requests pass, kill next 2, then allow rest +[[Server]] +Pattern = "GET /api/2.0/preview/scim/v2/Me" +KillCallerOffset = 2 +KillCaller = 2 +Response.Body = ''' +{ + "id": "123", + "userName": "test@example.com" +} +''' diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 9113021c8f1..1f8a705e054 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -61,7 +61,7 @@ func (db *DeploymentState) SaveState(key, newID string, state any, dependsOn []d db.Data.State = make(map[string]ResourceEntry) } - jsonMessage, err := json.MarshalIndent(state, " ", " ") + jsonMessage, err := json.MarshalIndent(state, "", " ") if err != nil { return err } @@ -72,7 +72,6 @@ func (db *DeploymentState) SaveState(key, newID string, state any, dependsOn []d DependsOn: dependsOn, } - // Write to WAL before updating memory if err := db.ensureWALOpen(); err != nil { return fmt.Errorf("failed to open WAL: %w", err) } @@ -94,7 +93,6 @@ func (db *DeploymentState) DeleteState(key string) error { return nil } - // Write to WAL before 
updating memory (nil entry means delete) if err := db.ensureWALOpen(); err != nil { return fmt.Errorf("failed to open WAL: %w", err) } @@ -119,7 +117,6 @@ func (db *DeploymentState) ensureWALOpen() error { return err } - // Generate lineage if this is a fresh deployment lineage := db.Data.Lineage if lineage == "" { lineage = uuid.New().String() @@ -196,8 +193,7 @@ func (db *DeploymentState) Open(ctx context.Context, path string) error { db.Path = path } - // Attempt WAL recovery - recovered, err := recoverFromWAL(path, &db.Data) + recovered, err := recoverFromWAL(ctx, path, &db.Data) if err != nil { return fmt.Errorf("WAL recovery failed: %w", err) } @@ -228,7 +224,6 @@ func (db *DeploymentState) Finalize() error { return err } - // Truncate WAL after successful state file write if db.wal != nil { if err := db.wal.truncate(); err != nil { return fmt.Errorf("failed to truncate WAL: %w", err) diff --git a/bundle/direct/dstate/wal.go b/bundle/direct/dstate/wal.go index 700bfa24e2d..37dd1bffa27 100644 --- a/bundle/direct/dstate/wal.go +++ b/bundle/direct/dstate/wal.go @@ -2,37 +2,34 @@ package dstate import ( "bufio" + "context" "encoding/json" "errors" "fmt" "os" + + "github.com/databricks/cli/libs/log" ) -// WALHeader is the first entry in the WAL file, containing metadata for validation. type WALHeader struct { Lineage string `json:"lineage"` Serial int `json:"serial"` } -// WALEntry represents a single state mutation in the WAL. -// For set operations, V is populated. For delete operations, V is nil. type WALEntry struct { K string `json:"k"` - V *ResourceEntry `json:"v,omitempty"` + V *ResourceEntry `json:"v,omitempty"` // nil means delete } -// WAL manages the Write-Ahead Log for deployment state recovery. type WAL struct { path string file *os.File } -// walPath returns the WAL file path for a given state file path. func walPath(statePath string) string { return statePath + ".wal" } -// openWAL opens or creates a WAL file for writing. 
func openWAL(statePath string) (*WAL, error) { wp := walPath(statePath) f, err := os.OpenFile(wp, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o600) @@ -42,7 +39,6 @@ func openWAL(statePath string) (*WAL, error) { return &WAL{path: wp, file: f}, nil } -// writeHeader writes the WAL header (lineage and serial) as the first entry. func (w *WAL) writeHeader(lineage string, serial int) error { header := WALHeader{ Lineage: lineage, @@ -51,7 +47,6 @@ func (w *WAL) writeHeader(lineage string, serial int) error { return w.writeJSON(header) } -// writeEntry appends a state mutation entry to the WAL. func (w *WAL) writeEntry(key string, entry *ResourceEntry) error { walEntry := WALEntry{ K: key, @@ -60,7 +55,6 @@ func (w *WAL) writeEntry(key string, entry *ResourceEntry) error { return w.writeJSON(walEntry) } -// writeJSON marshals and writes a JSON object as a single line, then syncs to disk. func (w *WAL) writeJSON(v any) error { data, err := json.Marshal(v) if err != nil { @@ -73,15 +67,9 @@ func (w *WAL) writeJSON(v any) error { return fmt.Errorf("failed to write WAL entry: %w", err) } - err = w.file.Sync() - if err != nil { - return fmt.Errorf("failed to sync WAL file: %w", err) - } - return nil } -// close closes the WAL file handle. func (w *WAL) close() error { if w.file != nil { return w.file.Close() @@ -89,7 +77,6 @@ func (w *WAL) close() error { return nil } -// truncate deletes the WAL file after successful finalization. func (w *WAL) truncate() error { if w.file != nil { w.file.Close() @@ -102,9 +89,7 @@ func (w *WAL) truncate() error { return nil } -// readWAL reads and parses an existing WAL file for recovery. -// Returns the header and entries, or an error if the WAL is invalid. 
-func readWAL(statePath string) (*WALHeader, []WALEntry, error) { +func readWAL(ctx context.Context, statePath string) (*WALHeader, []WALEntry, error) { wp := walPath(statePath) f, err := os.Open(wp) if err != nil { @@ -113,103 +98,98 @@ func readWAL(statePath string) (*WALHeader, []WALEntry, error) { defer f.Close() scanner := bufio.NewScanner(f) - var header *WALHeader - var entries []WALEntry - lineNum := 0 - + var lines [][]byte for scanner.Scan() { - lineNum++ line := scanner.Bytes() if len(line) == 0 { continue } + lineCopy := make([]byte, len(line)) + copy(lineCopy, line) + lines = append(lines, lineCopy) + } + if err := scanner.Err(); err != nil { + return nil, nil, fmt.Errorf("failed to read WAL file: %w", err) + } - if header == nil { - // First line must be the header - var h WALHeader - if err := json.Unmarshal(line, &h); err != nil { - return nil, nil, fmt.Errorf("WAL line %d: failed to parse header: %w", lineNum, err) - } - header = &h - } else { - // Subsequent lines are entries - var e WALEntry - if err := json.Unmarshal(line, &e); err != nil { - // Skip corrupted lines silently - this is expected for partial writes + if len(lines) == 0 { + return nil, nil, errors.New("WAL file is empty") + } + + var header WALHeader + if err := json.Unmarshal(lines[0], &header); err != nil { + return nil, nil, fmt.Errorf("failed to parse WAL header: %w", err) + } + + var entries []WALEntry + for i := 1; i < len(lines); i++ { + lineNum := i + 1 + isLastLine := i == len(lines)-1 + + var e WALEntry + if err := json.Unmarshal(lines[i], &e); err != nil { + if isLastLine { + log.Debugf(ctx, "WAL line %d: skipping corrupted last entry: %v", lineNum, err) continue } - if e.K == "" { - // Skip entries with empty keys + return nil, nil, fmt.Errorf("WAL line %d: corrupted entry in middle of WAL: %w", lineNum, err) + } + + if e.K == "" { + if isLastLine { + log.Debugf(ctx, "WAL line %d: skipping last entry with empty key", lineNum) continue } - entries = append(entries, e) + 
return nil, nil, fmt.Errorf("WAL line %d: entry with empty key in middle of WAL", lineNum) } - } - - if err := scanner.Err(); err != nil { - return nil, nil, fmt.Errorf("failed to read WAL file: %w", err) - } - if header == nil { - return nil, nil, errors.New("WAL file is empty or missing header") + entries = append(entries, e) } - return header, entries, nil + return &header, entries, nil } -// recoverFromWAL attempts to recover state from an existing WAL file. -// It validates the WAL against the current state and replays valid entries. -// Returns true if recovery was performed, false if no recovery needed. -func recoverFromWAL(statePath string, db *Database) (bool, error) { +func recoverFromWAL(ctx context.Context, statePath string, db *Database) (bool, error) { wp := walPath(statePath) - // Check if WAL exists if _, err := os.Stat(wp); os.IsNotExist(err) { return false, nil } - header, entries, err := readWAL(statePath) + header, entries, err := readWAL(ctx, statePath) if err != nil { - // If we can't read the WAL at all, delete it and proceed + log.Warnf(ctx, "Failed to read WAL file, deleting and proceeding: %v", err) os.Remove(wp) return false, nil } - // Validate WAL serial against state serial expectedSerial := db.Serial + 1 if header.Serial < expectedSerial { - // Stale WAL - delete and proceed without recovery + log.Debugf(ctx, "Deleting stale WAL (serial %d < expected %d)", header.Serial, expectedSerial) os.Remove(wp) return false, nil } if header.Serial > expectedSerial { - // WAL is ahead of state - this indicates corruption return false, fmt.Errorf("WAL serial (%d) is ahead of expected (%d), state may be corrupted", header.Serial, expectedSerial) } - // Validate lineage if both exist if db.Lineage != "" && header.Lineage != "" && db.Lineage != header.Lineage { return false, fmt.Errorf("WAL lineage (%s) does not match state lineage (%s)", header.Lineage, db.Lineage) } - // Adopt lineage from WAL if state doesn't have one if db.Lineage == "" && 
header.Lineage != "" { db.Lineage = header.Lineage } - // Initialize state map if needed if db.State == nil { db.State = make(map[string]ResourceEntry) } - // Replay entries for _, entry := range entries { if entry.V != nil { - // Set operation db.State[entry.K] = *entry.V } else { - // Delete operation delete(db.State, entry.K) } } diff --git a/bundle/direct/dstate/wal_test.go b/bundle/direct/dstate/wal_test.go index e475a92e9dc..9c2250c830e 100644 --- a/bundle/direct/dstate/wal_test.go +++ b/bundle/direct/dstate/wal_test.go @@ -20,15 +20,12 @@ func TestWALWriteAndRead(t *testing.T) { dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") - // Open WAL for writing wal, err := openWAL(statePath) require.NoError(t, err) - // Write header err = wal.writeHeader("test-lineage", 1) require.NoError(t, err) - // Write entries entry1 := &ResourceEntry{ ID: "12345", State: json.RawMessage(`{"name":"job1"}`), @@ -43,15 +40,14 @@ func TestWALWriteAndRead(t *testing.T) { err = wal.writeEntry("resources.jobs.job2", entry2) require.NoError(t, err) - // Write a delete entry (nil value) err = wal.writeEntry("resources.jobs.old_job", nil) require.NoError(t, err) err = wal.close() require.NoError(t, err) - // Read WAL back - header, entries, err := readWAL(statePath) + ctx := context.Background() + header, entries, err := readWAL(ctx, statePath) require.NoError(t, err) assert.Equal(t, "test-lineage", header.Lineage) @@ -76,40 +72,37 @@ func TestWALTruncate(t *testing.T) { statePath := filepath.Join(dir, "resources.json") walFilePath := walPath(statePath) - // Create WAL file wal, err := openWAL(statePath) require.NoError(t, err) err = wal.writeHeader("test-lineage", 1) require.NoError(t, err) - // Verify file exists _, err = os.Stat(walFilePath) require.NoError(t, err) - // Truncate err = wal.truncate() require.NoError(t, err) - // Verify file is removed _, err = os.Stat(walFilePath) assert.True(t, os.IsNotExist(err)) } func TestRecoverFromWAL_NoWAL(t *testing.T) { + 
ctx := context.Background() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") db := NewDatabase("", 0) - recovered, err := recoverFromWAL(statePath, &db) + recovered, err := recoverFromWAL(ctx, statePath, &db) require.NoError(t, err) assert.False(t, recovered) } func TestRecoverFromWAL_ValidWAL(t *testing.T) { + ctx := context.Background() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") - // Create WAL with serial = 1 (expecting state serial 0 + 1) wal, err := openWAL(statePath) require.NoError(t, err) err = wal.writeHeader("test-lineage", 1) @@ -124,26 +117,23 @@ func TestRecoverFromWAL_ValidWAL(t *testing.T) { err = wal.close() require.NoError(t, err) - // Create database with serial 0 db := NewDatabase("", 0) - // Recover - recovered, err := recoverFromWAL(statePath, &db) + recovered, err := recoverFromWAL(ctx, statePath, &db) require.NoError(t, err) assert.True(t, recovered) - // Verify state was recovered assert.Equal(t, "test-lineage", db.Lineage) require.Contains(t, db.State, "resources.jobs.job1") assert.Equal(t, "12345", db.State["resources.jobs.job1"].ID) } func TestRecoverFromWAL_StaleWAL(t *testing.T) { + ctx := context.Background() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") walFilePath := walPath(statePath) - // Create WAL with serial = 1 wal, err := openWAL(statePath) require.NoError(t, err) err = wal.writeHeader("test-lineage", 1) @@ -151,24 +141,21 @@ func TestRecoverFromWAL_StaleWAL(t *testing.T) { err = wal.close() require.NoError(t, err) - // Create database with serial 2 (WAL is stale) - db := NewDatabase("test-lineage", 2) + db := NewDatabase("test-lineage", 2) // serial 2 makes WAL stale - // Recover - should skip and delete WAL - recovered, err := recoverFromWAL(statePath, &db) + recovered, err := recoverFromWAL(ctx, statePath, &db) require.NoError(t, err) assert.False(t, recovered) - // WAL should be deleted _, err = os.Stat(walFilePath) assert.True(t, os.IsNotExist(err)) } 
func TestRecoverFromWAL_FutureWAL(t *testing.T) { + ctx := context.Background() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") - // Create WAL with serial = 5 wal, err := openWAL(statePath) require.NoError(t, err) err = wal.writeHeader("test-lineage", 5) @@ -176,20 +163,18 @@ func TestRecoverFromWAL_FutureWAL(t *testing.T) { err = wal.close() require.NoError(t, err) - // Create database with serial 0 (WAL is from future - corrupted state) db := NewDatabase("test-lineage", 0) - // Recover - should fail - _, err = recoverFromWAL(statePath, &db) + _, err = recoverFromWAL(ctx, statePath, &db) assert.Error(t, err) - assert.Contains(t, err.Error(), "WAL serial (5) is ahead of expected (1)") + assert.Contains(t, err.Error(), "ahead of expected") } func TestRecoverFromWAL_LineageMismatch(t *testing.T) { + ctx := context.Background() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") - // Create WAL with lineage A wal, err := openWAL(statePath) require.NoError(t, err) err = wal.writeHeader("lineage-A", 1) @@ -197,26 +182,23 @@ func TestRecoverFromWAL_LineageMismatch(t *testing.T) { err = wal.close() require.NoError(t, err) - // Create database with lineage B db := NewDatabase("lineage-B", 0) - // Recover - should fail - _, err = recoverFromWAL(statePath, &db) + _, err = recoverFromWAL(ctx, statePath, &db) assert.Error(t, err) assert.Contains(t, err.Error(), "lineage") } func TestRecoverFromWAL_DeleteOperation(t *testing.T) { + ctx := context.Background() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") - // Create WAL with delete operation wal, err := openWAL(statePath) require.NoError(t, err) err = wal.writeHeader("test-lineage", 1) require.NoError(t, err) - // Add an entry entry := &ResourceEntry{ ID: "12345", State: json.RawMessage(`{"name":"job1"}`), @@ -224,22 +206,18 @@ func TestRecoverFromWAL_DeleteOperation(t *testing.T) { err = wal.writeEntry("resources.jobs.job1", entry) require.NoError(t, err) - // 
Delete the entry err = wal.writeEntry("resources.jobs.job1", nil) require.NoError(t, err) err = wal.close() require.NoError(t, err) - // Create database db := NewDatabase("", 0) - // Recover - recovered, err := recoverFromWAL(statePath, &db) + recovered, err := recoverFromWAL(ctx, statePath, &db) require.NoError(t, err) assert.True(t, recovered) - // Entry should NOT be present (deleted) assert.NotContains(t, db.State, "resources.jobs.job1") } @@ -249,36 +227,29 @@ func TestDeploymentState_WALIntegration(t *testing.T) { statePath := filepath.Join(dir, "resources.json") walFilePath := walPath(statePath) - // Create deployment state var db DeploymentState err := db.Open(ctx, statePath) require.NoError(t, err) - // Save some state err = db.SaveState("resources.jobs.job1", "12345", map[string]string{"name": "job1"}, nil) require.NoError(t, err) - // WAL should exist _, err = os.Stat(walFilePath) require.NoError(t, err) - // Read WAL to verify content - header, entries, err := readWAL(statePath) + header, entries, err := readWAL(ctx, statePath) require.NoError(t, err) - assert.Equal(t, 1, header.Serial) // serial + 1 + assert.Equal(t, 1, header.Serial) require.Len(t, entries, 1) assert.Equal(t, "resources.jobs.job1", entries[0].K) assert.Equal(t, "12345", entries[0].V.ID) - // Finalize err = db.Finalize() require.NoError(t, err) - // WAL should be deleted _, err = os.Stat(walFilePath) assert.True(t, os.IsNotExist(err)) - // State file should exist with correct serial data, err := os.ReadFile(statePath) require.NoError(t, err) var savedDB Database @@ -293,7 +264,6 @@ func TestDeploymentState_WALRecoveryOnOpen(t *testing.T) { dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") - // Create initial state file initialDB := NewDatabase("test-lineage", 5) initialDB.State["resources.jobs.existing"] = ResourceEntry{ ID: "existing-id", @@ -304,7 +274,6 @@ func TestDeploymentState_WALRecoveryOnOpen(t *testing.T) { err = os.WriteFile(statePath, data, 0o600) 
require.NoError(t, err) - // Create WAL with serial 6 (5 + 1) wal, err := openWAL(statePath) require.NoError(t, err) err = wal.writeHeader("test-lineage", 6) @@ -318,12 +287,10 @@ func TestDeploymentState_WALRecoveryOnOpen(t *testing.T) { err = wal.close() require.NoError(t, err) - // Open should recover from WAL var db DeploymentState err = db.Open(ctx, statePath) require.NoError(t, err) - // Both existing and new resources should be present assert.Contains(t, db.Data.State, "resources.jobs.existing") assert.Contains(t, db.Data.State, "resources.jobs.new") assert.Equal(t, "new-id", db.Data.State["resources.jobs.new"].ID) @@ -338,27 +305,22 @@ func TestDeploymentState_DeleteStateWritesWAL(t *testing.T) { err := db.Open(ctx, statePath) require.NoError(t, err) - // Add a resource err = db.SaveState("resources.jobs.job1", "12345", map[string]string{"name": "job1"}, nil) require.NoError(t, err) - // Delete the resource err = db.DeleteState("resources.jobs.job1") require.NoError(t, err) - // Read WAL to verify delete entry - _, entries, err := readWAL(statePath) + _, entries, err := readWAL(ctx, statePath) require.NoError(t, err) require.Len(t, entries, 2) assert.Equal(t, "resources.jobs.job1", entries[1].K) - assert.Nil(t, entries[1].V) // nil means delete + assert.Nil(t, entries[1].V) - // Finalize err = db.Finalize() require.NoError(t, err) - // State file should NOT contain the deleted resource data, err := os.ReadFile(statePath) require.NoError(t, err) var savedDB Database @@ -383,8 +345,7 @@ func TestDeploymentState_WALWithDependsOn(t *testing.T) { err = db.SaveState("resources.jobs.job1", "12345", map[string]string{"name": "job1"}, dependsOn) require.NoError(t, err) - // Read WAL - _, entries, err := readWAL(statePath) + _, entries, err := readWAL(ctx, statePath) require.NoError(t, err) require.Len(t, entries, 1) @@ -393,12 +354,12 @@ func TestDeploymentState_WALWithDependsOn(t *testing.T) { assert.Equal(t, "resources.clusters.cluster1", 
entries[0].V.DependsOn[0].Node) } -func TestRecoverFromWAL_CorruptedLine(t *testing.T) { +func TestRecoverFromWAL_CorruptedMiddleLine(t *testing.T) { + ctx := context.Background() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") walFilePath := walPath(statePath) - // Manually write WAL with corrupted line content := `{"lineage":"test","serial":1} {"k":"resources.jobs.job1","v":{"__id__":"12345","state":{}}} not valid json @@ -408,12 +369,129 @@ not valid json require.NoError(t, err) db := NewDatabase("", 0) - recovered, err := recoverFromWAL(statePath, &db) + recovered, err := recoverFromWAL(ctx, statePath, &db) + require.NoError(t, err) + assert.False(t, recovered) + assert.Empty(t, db.State) + + _, err = os.Stat(walFilePath) + assert.True(t, os.IsNotExist(err)) +} + +func TestRecoverFromWAL_CorruptedLastLine(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + walFilePath := walPath(statePath) + + content := `{"lineage":"test","serial":1} +{"k":"resources.jobs.job1","v":{"__id__":"12345","state":{}}} +{"k":"resources.jobs.job2","v":{"__id__":"67890","state":{}}} +not valid json +` + err := os.WriteFile(walFilePath, []byte(content), 0o600) + require.NoError(t, err) + + db := NewDatabase("", 0) + recovered, err := recoverFromWAL(ctx, statePath, &db) require.NoError(t, err) assert.True(t, recovered) - // Should have recovered job1 and job2, skipping corrupted line assert.Contains(t, db.State, "resources.jobs.job1") assert.Contains(t, db.State, "resources.jobs.job2") + assert.Equal(t, "12345", db.State["resources.jobs.job1"].ID) + assert.Equal(t, "67890", db.State["resources.jobs.job2"].ID) +} + +func TestDeploymentState_RecoveredFromWALFlag(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + + initialDB := NewDatabase("test-lineage", 0) + data, err := json.Marshal(initialDB) + require.NoError(t, err) + err = 
os.WriteFile(statePath, data, 0o600) + require.NoError(t, err) + + wal, err := openWAL(statePath) + require.NoError(t, err) + err = wal.writeHeader("test-lineage", 1) + require.NoError(t, err) + err = wal.writeEntry("resources.jobs.job1", &ResourceEntry{ID: "123", State: json.RawMessage(`{}`)}) + require.NoError(t, err) + err = wal.close() + require.NoError(t, err) + + var db DeploymentState + err = db.Open(ctx, statePath) + require.NoError(t, err) + + assert.True(t, db.RecoveredFromWAL()) +} + +func TestRecoverFromWAL_LineageAdoption(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + walFilePath := walPath(statePath) + + content := `{"lineage":"adopted-lineage","serial":1} +{"k":"resources.jobs.job1","v":{"__id__":"12345","state":{}}} +` + err := os.WriteFile(walFilePath, []byte(content), 0o600) + require.NoError(t, err) + + db := NewDatabase("", 0) // empty lineage + recovered, err := recoverFromWAL(ctx, statePath, &db) + require.NoError(t, err) + assert.True(t, recovered) + assert.Equal(t, "adopted-lineage", db.Lineage) +} + +func TestReadWAL_EmptyFile(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + walFilePath := walPath(statePath) + + err := os.WriteFile(walFilePath, []byte(""), 0o600) + require.NoError(t, err) + + _, _, err = readWAL(ctx, statePath) + assert.Error(t, err) + assert.Contains(t, err.Error(), "empty") } +func TestDeploymentState_MultipleOperationsSameKey(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + + var db DeploymentState + err := db.Open(ctx, statePath) + require.NoError(t, err) + + err = db.SaveState("resources.jobs.job1", "111", map[string]string{"v": "1"}, nil) + require.NoError(t, err) + + err = db.DeleteState("resources.jobs.job1") + require.NoError(t, err) + + err = db.SaveState("resources.jobs.job1", "222", 
map[string]string{"v": "2"}, nil) + require.NoError(t, err) + + _, entries, err := readWAL(ctx, statePath) + require.NoError(t, err) + require.Len(t, entries, 3) + assert.Equal(t, "111", entries[0].V.ID) + assert.Nil(t, entries[1].V) + assert.Equal(t, "222", entries[2].V.ID) + + err = db.Finalize() + require.NoError(t, err) + + entry, ok := db.GetResourceEntry("resources.jobs.job1") + require.True(t, ok) + assert.Equal(t, "222", entry.ID) +} diff --git a/wal.txt b/wal.txt deleted file mode 100644 index d365ed56d8d..00000000000 --- a/wal.txt +++ /dev/null @@ -1,205 +0,0 @@ -Design Document: Write-Ahead Log (WAL) for Bundle Deployment State Recovery -1. Problem Statement -When databricks bundle deploy is interrupted, resources created before the interruption become orphaned. The CLI only writes the state file at the end of deployment via Finalize(). Any resources created mid-deployment are lost from tracking. - -Current behavior: -Deploy starts → Create Job A → Create Job B → [CRASH] → State file empty → Jobs A, B orphaned - -Impact: Orphaned resources exist in Databricks but are unknown to future deployments. Users accumulate duplicate resources, leading to confusion and unexpected costs. - -Scope: Direct deployment engine only. Terraform has its own state management. -2. Solution Overview -Implement a Write-Ahead Log (WAL) that records each state mutation to disk immediately after the corresponding API call succeeds. -On recovery, replay the WAL to restore partial deployment state. - -Proposed behavior: -Deploy starts → Create Job A → [WAL: A] → Create Job B → [WAL: A,B] → [CRASH] -Next deploy → Load state → Replay WAL → State has A,B → No duplicates -3. Detailed Design -3.1 File Structure -The WAL is stored locally alongside the existing state file. - -File Path -Description -~/.databricks/bundle/// -Root directory for the bundle's state data. -~/.databricks/bundle///resources.json -The committed state file (existing). 
-~/.databricks/bundle///resources.json.wal -The Write-Ahead Log file (new). - -3.2 WAL Entry Format -Each entry is a JSON object written as a single line (NDJSON format). The entry embeds the existing ResourceEntry structure for consistency with the state file. - -Field -Type -Description -Lineage (First Entry Only) -String -UUID matching the state file's lineage (for validation). -Serial (First Entry Only) -Integer -Deployment serial number (for validation). -k (2nd Entry Onwards) -String -Resource key (e.g., resources.jobs.my_job). -v (2nd Entry Onwards) -ResourceEntry -The state entry. Omitted for delete operations. - - -ResourceEntry structure (existing, reused): - -Field -Type -Description -__id__ -String -The unique ID assigned by the Databricks API. -state -Object -Full snapshot of the resource configuration. - - -Example WAL: -{"lineage":"abc-123"} -{"k":"resources.jobs.my_job","v":{"__id__":"1234567","state":{...}}} -{"k":"resources.jobs.old_job"} // no v means delete op -3.3 WAL Lifecycle -Phase -Action -Open -Create or open resources.json.wal. -Write -Append entry after each successful API call. -Truncate -Delete resources.json.wal after successful Finalize(). - - -Durability: Each entry must be flushed to disk (fsync) immediately after the successful API response before proceeding. -Known Limitation: There is a small window (~microseconds) between API success and WAL write where a crash would orphan the resource. This is unavoidable is acceptable. -3.4 Recovery Mechanism -Recovery occurs at the start of deployment if the WAL file exists. - -Check: If resources.json.wal exists, initiate recovery. -Load Base State: -If resources.json exists: load it (provides lineage and serial). We are making sure it exists by writing immediately once we open/create it in the Open() method -Otherwise: create fresh state with new lineage. -Read WAL: Parse all entries from resources.json.wal (already chronologically ordered). 
-Validate Entries: -WAL serial == state serial + 1: Valid — replay entries. -WAL serial < state serial + 1: Stale WAL — delete WAL file, proceed without recovery. -WAL serial > state serial + 1: Corrupted state — return error. -Replay: For each valid entry: -set: Add or overwrite the resource in memory. -delete: Remove the resource from memory. -Proceed: Use the resulting state as the starting point for deployment. -Finalize: On success, write resources.json and delete resources.json.wal. -3.5 Integration Points -Action -Location -Detail -Recovery Check -Open() in dstate/state.go -Check for the WAL file and replay before proceeding. -Write WAL Entry -SaveState() / DeleteState() -Append entry before updating memory. -Truncation -Finalize() -Delete WAL after successful state file write. - -3.6 Error Handling -Scenario -Behavior -WAL write fails -Return error, abort deployment. -Corrupted WAL line -Log warning, skip line, continue replay. -Lineage mismatch -Return error, abort deployment. -Stale serial -Delete WAL - -5. Testing Plan -Use acceptance tests. Add support for the crash caller process from the test server. -Key test cases: -Tests which compile and run real binary against testserver. - -Normal deploy — WAL created, used, deleted. -Crash after 1 resource — recovery works. -Fresh deploy with existing WAL — lineage adopted. -Stale WAL (old serial) — entries skipped. -Corrupted WAL line — skipped, rest recovered. -Bundle summary works after interrupted deploy and sees ids stored in WAL -7. Open Questions -# -Question -Proposed Answer -1 -Should WAL be pushed to remote? -Never - -5. 
Test Plan - -We should use acceptance tests which compile and run real binary against testerver - -5.1 Unit Tests - WAL File Operations -| Test ID | Description | Expected Behavior | -|---------|-------------|-------------------| -| U01 | WAL path generation | walPath("resources.json") returns "resources.json.wal" | -| U02 | Write and read WAL | Header + entries written and read back correctly | -| U03 | Truncate WAL | File deleted from disk | -| U04 | Truncate non-existent WAL | No error returned | -| U05 | Read empty WAL | Returns error "WAL file is empty or missing header" | - -5.2 Unit Tests - WAL Recovery Logic -| Test ID | Description | Expected Behavior | -|---------|-------------|-------------------| -| R01 | No WAL exists | recoverFromWAL returns (false, nil) | -| R02 | Valid WAL (serial = state+1) | Entries replayed, returns (true, nil) | -| R03 | Stale WAL (serial < state+1) | WAL deleted, returns (false, nil) | -| R04 | Future WAL (serial > state+1) | Returns error about corruption | -| R05 | Lineage mismatch | Returns error about lineage mismatch | -| R06 | Lineage adopted from WAL | If state has no lineage, WAL lineage is used | -| R07 | Delete operation replay | Entry removed from state map | -| R08 | Corrupted entry line | Skipped, other entries recovered | - -5.3 Unit Tests - Integration with DeploymentState -| Test ID | Description | Expected Behavior | -|---------|-------------|-------------------| -| I01 | SaveState/DeleteState/Finalize flow | WAL created on first SaveState, entries written, truncated on Finalize, serial incremented | -| I02 | Finalize cleans stale WAL | If WAL file exists but wasn't opened this session, delete it | -| I03 | Open with existing WAL | Recovery performed before return | -| I04 | SaveState with DependsOn | DependsOn preserved in WAL entry | - -5.4 Acceptance Tests -| Test ID | Description | Steps | Expected Behavior | -|---------|-------------|-------|-------------------| -| A01 | Normal deploy | Deploy bundle with 
2 resources | WAL created during deploy, deleted after Finalize | -| A02 | Crash recovery | 1. Deploy, crash after resource A created 2. Redeploy | Resource A recovered from WAL, resource B created, no duplicates | -| A03 | Bundle summary after crash | 1. Deploy, crash mid-deploy 2. Run bundle summary | Shows resources from WAL with correct IDs | - -5.5 Tests Implemented in wal_test.go -- TestWALPath (U01) -- TestWALWriteAndRead (U02) -- TestWALTruncate (U03, U04) -- TestRecoverFromWAL_NoWAL (R01) -- TestRecoverFromWAL_ValidWAL (R02) -- TestRecoverFromWAL_StaleWAL (R03) -- TestRecoverFromWAL_FutureWAL (R04) -- TestRecoverFromWAL_LineageMismatch (R05) -- TestRecoverFromWAL_DeleteOperation (R07) -- TestRecoverFromWAL_CorruptedLine (R08) -- TestDeploymentState_WALIntegration (I01) -- TestDeploymentState_WALRecoveryOnOpen (I03) -- TestDeploymentState_DeleteStateWritesWAL (I01) -- TestDeploymentState_WALWithDependsOn (I04) - -5.6 Tests Still Needed -| Test ID | Description | Priority | -|---------|-------------|----------| -| R06 | TestRecoverFromWAL_LineageAdoption (fresh state adopts WAL lineage) | High | -| I02 | TestDeploymentState_FinalizeCleansStaleWAL | Medium | -| U05 | TestReadEmptyWAL | Low | -| A01-A03 | Acceptance tests (require crash simulation infrastructure) | High | From 8fcd7eb6afd35e91f0f347b3380178e41c544547 Mon Sep 17 00:00:00 2001 From: Varun Deep Saini Date: Sat, 24 Jan 2026 00:51:38 +0530 Subject: [PATCH 03/80] Updated existing tests Signed-off-by: Varun Deep Saini --- .../out.deploy.direct.txt | 7 ++++++ .../out.deploy.terraform.txt | 6 +++++ .../output.txt | 6 ----- .../script | 2 +- .../test.toml | 1 + .../build_and_files_whl/out.deploy.direct.txt | 8 +++++++ .../out.deploy.terraform.txt | 7 ++++++ .../artifacts/build_and_files_whl/output.txt | 7 ------ .../artifacts/build_and_files_whl/script | 2 +- .../artifacts/build_and_files_whl/test.toml | 1 + .../shell/bash/out.deploy.direct.txt | 7 ++++++ .../shell/bash/out.deploy.terraform.txt | 6 
+++++ .../bundle/artifacts/shell/bash/output.txt | 5 ---- acceptance/bundle/artifacts/shell/bash/script | 2 +- .../shell/basic/out.deploy.direct.txt | 7 ++++++ .../shell/basic/out.deploy.terraform.txt | 6 +++++ .../bundle/artifacts/shell/basic/output.txt | 5 ---- .../bundle/artifacts/shell/basic/script | 2 +- .../shell/default/out.deploy.direct.txt | 7 ++++++ .../shell/default/out.deploy.terraform.txt | 6 +++++ .../bundle/artifacts/shell/default/output.txt | 5 ---- .../bundle/artifacts/shell/default/script | 2 +- .../artifacts/shell/sh/out.deploy.direct.txt | 7 ++++++ .../shell/sh/out.deploy.terraform.txt | 6 +++++ .../bundle/artifacts/shell/sh/output.txt | 5 ---- acceptance/bundle/artifacts/shell/sh/script | 2 +- .../deploy/empty-bundle/out.deploy.direct.txt | 6 +++++ .../empty-bundle/out.deploy.terraform.txt | 5 ++++ .../bundle/deploy/empty-bundle/output.txt | 5 ---- acceptance/bundle/deploy/empty-bundle/script | 2 +- .../bundle/deploy/wal/chain-10-jobs/test.toml | 1 - .../deploy/wal/corrupted-wal-entry/test.toml | 1 - .../deploy/wal/corrupted-wal-middle/test.toml | 1 - .../deploy/wal/crash-after-create/test.toml | 1 - .../deploy/wal/multiple-crashes/test.toml | 1 - .../deploy/wal/summary-after-crash/test.toml | 1 - .../bundle/scripts/out.deploy.direct.txt | 24 +++++++++++++++++++ .../bundle/scripts/out.deploy.terraform.txt | 23 ++++++++++++++++++ acceptance/bundle/scripts/output.txt | 23 ------------------ .../out.deploy.direct.txt | 18 ++++++++++++++ .../out.deploy.terraform.txt | 17 +++++++++++++ .../scripts/restricted-execution/output.txt | 17 ------------- .../scripts/restricted-execution/script | 2 +- acceptance/bundle/scripts/script | 2 +- .../out.deploy-one.direct.txt | 6 +++++ .../out.deploy-one.terraform.txt | 5 ++++ .../out.deploy-two.direct.txt | 6 +++++ .../out.deploy-two.terraform.txt | 5 ++++ .../deploy-artifact-path-type/output.txt | 10 -------- .../deploy-artifact-path-type/script | 4 ++-- .../out.deploy.direct.txt | 6 +++++ 
.../out.deploy.terraform.txt | 5 ++++ .../deploy-config-file-count/output.txt | 5 ---- .../telemetry/deploy-config-file-count/script | 2 +- .../deploy-mode/out.deploy-dev.direct.txt | 6 +++++ .../deploy-mode/out.deploy-dev.terraform.txt | 5 ++++ .../deploy-mode/out.deploy-prod.direct.txt | 12 ++++++++++ .../deploy-mode/out.deploy-prod.terraform.txt | 11 +++++++++ .../bundle/telemetry/deploy-mode/output.txt | 16 ------------- .../bundle/telemetry/deploy-mode/script | 4 ++-- .../deploy-target-count/out.deploy.direct.txt | 6 +++++ .../out.deploy.terraform.txt | 5 ++++ .../telemetry/deploy-target-count/output.txt | 5 ---- .../telemetry/deploy-target-count/script | 2 +- .../out.deploy.direct.txt | 6 +++++ .../out.deploy.terraform.txt | 5 ++++ .../deploy-variable-count/output.txt | 5 ---- .../telemetry/deploy-variable-count/script | 2 +- .../out.deploy-one.direct.txt | 8 +++++++ .../out.deploy-one.terraform.txt | 7 ++++++ .../out.deploy-two.direct.txt | 8 +++++++ .../out.deploy-two.terraform.txt | 7 ++++++ .../telemetry/deploy-whl-artifacts/output.txt | 14 ----------- .../telemetry/deploy-whl-artifacts/script | 4 ++-- .../sync_patterns/out.deploy.direct.txt | 6 +++++ .../sync_patterns/out.deploy.terraform.txt | 5 ++++ .../bundle/validate/sync_patterns/output.txt | 5 ---- .../bundle/validate/sync_patterns/script | 2 +- acceptance/cache/simple/out.deploy.direct.txt | 6 +++++ .../cache/simple/out.deploy.terraform.txt | 5 ++++ acceptance/cache/simple/script | 2 +- acceptance/cache/simple/test.toml | 3 +++ bundle/direct/dstate/state.go | 15 ++++++++++++ bundle/direct/dstate/wal_test.go | 1 + 84 files changed, 355 insertions(+), 164 deletions(-) create mode 100644 acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt create mode 100644 acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt create mode 100644 acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt create mode 100644 
acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt create mode 100644 acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt create mode 100644 acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt create mode 100644 acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt create mode 100644 acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt create mode 100644 acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt create mode 100644 acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt create mode 100644 acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt create mode 100644 acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt create mode 100644 acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt create mode 100644 acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt create mode 100644 acceptance/bundle/scripts/out.deploy.direct.txt create mode 100644 acceptance/bundle/scripts/out.deploy.terraform.txt create mode 100644 acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt create mode 100644 acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt create mode 100644 
acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt create mode 100644 acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt create mode 100644 acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt create mode 100644 acceptance/cache/simple/out.deploy.direct.txt create mode 100644 acceptance/cache/simple/out.deploy.terraform.txt diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt new file mode 100644 index 00000000000..f75a5428b16 --- /dev/null +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy +Uploading whl/source.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt new file mode 100644 index 00000000000..8ec9c52db62 --- /dev/null +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Uploading whl/source.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt index 6d24880e6c0..6c8bd962a56 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt @@ -1,10 +1,4 @@ ->>> [CLI] bundle deploy -Uploading whl/source.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! 
- === Expecting wheel to be uploaded >>> jq .path "/api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files/whl/source.whl" diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script index 883601185c9..fba3a777006 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script @@ -2,7 +2,7 @@ mkdir -p whl echo "test wheel content" > whl/source.whl -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 title "Expecting wheel to be uploaded" trace jq .path < out.requests.txt | grep import | grep whl | sort diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml index a0a680e9d19..b6c55dac31e 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml @@ -1,4 +1,5 @@ RecordRequests = true +EnvVaryOutput = "DATABRICKS_BUNDLE_ENGINE" Ignore = [ '.venv', 'dist', diff --git a/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt b/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt new file mode 100644 index 00000000000..4039d5917e8 --- /dev/null +++ b/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt @@ -0,0 +1,8 @@ + +>>> errcode [CLI] bundle deploy +Building artifact_with_custom_dist... +Uploading mydist/my_test_code-0.0.1-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
diff --git a/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt b/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt new file mode 100644 index 00000000000..9894e5b89ff --- /dev/null +++ b/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt @@ -0,0 +1,7 @@ + +>>> errcode [CLI] bundle deploy +Building artifact_with_custom_dist... +Uploading mydist/my_test_code-0.0.1-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/build_and_files_whl/output.txt b/acceptance/bundle/artifacts/build_and_files_whl/output.txt index b618de6b89a..d44a21b582a 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/output.txt +++ b/acceptance/bundle/artifacts/build_and_files_whl/output.txt @@ -7,10 +7,3 @@ Workspace: Path: /Workspace/Users/[USERNAME]/.bundle/test-bundle/default Validation OK! - ->>> errcode [CLI] bundle deploy -Building artifact_with_custom_dist... -Uploading mydist/my_test_code-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/build_and_files_whl/script b/acceptance/bundle/artifacts/build_and_files_whl/script index 2d7d63f7fec..9aa0d870e7a 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/script +++ b/acceptance/bundle/artifacts/build_and_files_whl/script @@ -1,5 +1,5 @@ cp -r $TESTDIR/../whl_explicit/my_test_code/{setup.py,src} . 
trace $CLI bundle validate # I expect this deploy to work because I explicitly told where to find the wheel, but it does not: -trace errcode $CLI bundle deploy +trace errcode $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 rm mydist/my_test_code-0.0.1-py3-none-any.whl setup.py src/*.py diff --git a/acceptance/bundle/artifacts/build_and_files_whl/test.toml b/acceptance/bundle/artifacts/build_and_files_whl/test.toml index a030353d571..a93d901b688 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/test.toml +++ b/acceptance/bundle/artifacts/build_and_files_whl/test.toml @@ -1 +1,2 @@ RecordRequests = false +EnvVaryOutput = "DATABRICKS_BUNDLE_ENGINE" diff --git a/acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt new file mode 100644 index 00000000000..f311959abdd --- /dev/null +++ b/acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt new file mode 100644 index 00000000000..fa5d7b76bcd --- /dev/null +++ b/acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/bash/output.txt b/acceptance/bundle/artifacts/shell/bash/output.txt index fa5d7b76bcd..8b137891791 100644 --- a/acceptance/bundle/artifacts/shell/bash/output.txt +++ b/acceptance/bundle/artifacts/shell/bash/output.txt @@ -1,6 +1 @@ ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/bash/script b/acceptance/bundle/artifacts/shell/bash/script index 68ebb78d775..09bb41643ca 100644 --- a/acceptance/bundle/artifacts/shell/bash/script +++ b/acceptance/bundle/artifacts/shell/bash/script @@ -1 +1 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt new file mode 100644 index 00000000000..3a4ff9138ba --- /dev/null +++ b/acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-basic/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt new file mode 100644 index 00000000000..b5e01c79e67 --- /dev/null +++ b/acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-basic/default/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/basic/output.txt b/acceptance/bundle/artifacts/shell/basic/output.txt index b5e01c79e67..8b137891791 100644 --- a/acceptance/bundle/artifacts/shell/basic/output.txt +++ b/acceptance/bundle/artifacts/shell/basic/output.txt @@ -1,6 +1 @@ ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-basic/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/basic/script b/acceptance/bundle/artifacts/shell/basic/script index 68ebb78d775..09bb41643ca 100644 --- a/acceptance/bundle/artifacts/shell/basic/script +++ b/acceptance/bundle/artifacts/shell/basic/script @@ -1 +1 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt new file mode 100644 index 00000000000..f311959abdd --- /dev/null +++ b/acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt new file mode 100644 index 00000000000..fa5d7b76bcd --- /dev/null +++ b/acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/default/output.txt b/acceptance/bundle/artifacts/shell/default/output.txt index fa5d7b76bcd..8b137891791 100644 --- a/acceptance/bundle/artifacts/shell/default/output.txt +++ b/acceptance/bundle/artifacts/shell/default/output.txt @@ -1,6 +1 @@ ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/default/script b/acceptance/bundle/artifacts/shell/default/script index 68ebb78d775..09bb41643ca 100644 --- a/acceptance/bundle/artifacts/shell/default/script +++ b/acceptance/bundle/artifacts/shell/default/script @@ -1 +1 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt new file mode 100644 index 00000000000..98820986f53 --- /dev/null +++ b/acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-sh/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt new file mode 100644 index 00000000000..5117e6e9fc0 --- /dev/null +++ b/acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-sh/default/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/sh/output.txt b/acceptance/bundle/artifacts/shell/sh/output.txt index 5117e6e9fc0..8b137891791 100644 --- a/acceptance/bundle/artifacts/shell/sh/output.txt +++ b/acceptance/bundle/artifacts/shell/sh/output.txt @@ -1,6 +1 @@ ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-sh/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/sh/script b/acceptance/bundle/artifacts/shell/sh/script index 68ebb78d775..09bb41643ca 100644 --- a/acceptance/bundle/artifacts/shell/sh/script +++ b/acceptance/bundle/artifacts/shell/sh/script @@ -1 +1 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt b/acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt new file mode 100644 index 00000000000..81dddfcb9fc --- /dev/null +++ b/acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt b/acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt new file mode 100644 index 00000000000..494f76c84fa --- /dev/null +++ b/acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/deploy/empty-bundle/output.txt b/acceptance/bundle/deploy/empty-bundle/output.txt index 919accb661f..8498653a6e7 100644 --- a/acceptance/bundle/deploy/empty-bundle/output.txt +++ b/acceptance/bundle/deploy/empty-bundle/output.txt @@ -1,9 +1,4 @@ ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... -Deploying resources... -Deployment complete! - >>> [CLI] bundle destroy --auto-approve All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default diff --git a/acceptance/bundle/deploy/empty-bundle/script b/acceptance/bundle/deploy/empty-bundle/script index 775ccd0defc..b74818f1b1a 100644 --- a/acceptance/bundle/deploy/empty-bundle/script +++ b/acceptance/bundle/deploy/empty-bundle/script @@ -4,4 +4,4 @@ cleanup() { } trap cleanup EXIT -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/test.toml b/acceptance/bundle/deploy/wal/chain-10-jobs/test.toml index c4308521be1..36076f3df5e 100644 --- a/acceptance/bundle/deploy/wal/chain-10-jobs/test.toml +++ b/acceptance/bundle/deploy/wal/chain-10-jobs/test.toml @@ -14,4 +14,3 @@ Response.Body = '{}' [[Server]] Pattern = "GET /api/2.2/jobs/get" Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' - diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml index 9c9ab5a30bd..6245c198409 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml @@ -11,4 +11,3 @@ Response.Body = '{"job_id": 1111, "settings": {"name": "valid-job"}}' [[Server]] Pattern = "GET /api/2.2/jobs/get?job_id=2222" Response.Body = '{"job_id": 2222, "settings": {"name": "another-valid"}}' - diff --git 
a/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml index 8aa40be8d70..ec6fa7b3f4f 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml @@ -10,4 +10,3 @@ Response.Body = '{"job_id": 9999}' [[Server]] Pattern = "GET /api/2.2/jobs/get?job_id=9999" Response.Body = '{"job_id": 9999, "settings": {"name": "fresh-job"}}' - diff --git a/acceptance/bundle/deploy/wal/crash-after-create/test.toml b/acceptance/bundle/deploy/wal/crash-after-create/test.toml index 5023224e577..eebad72de53 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/test.toml +++ b/acceptance/bundle/deploy/wal/crash-after-create/test.toml @@ -14,4 +14,3 @@ Response.Body = '{}' Pattern = "GET /api/2.2/jobs/get" KillCaller = 1 Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' - diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/test.toml b/acceptance/bundle/deploy/wal/multiple-crashes/test.toml index c5981d67208..474177b8046 100644 --- a/acceptance/bundle/deploy/wal/multiple-crashes/test.toml +++ b/acceptance/bundle/deploy/wal/multiple-crashes/test.toml @@ -16,4 +16,3 @@ Response.Body = '{}' Pattern = "GET /api/2.2/jobs/get" KillCaller = 1 Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' - diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/test.toml b/acceptance/bundle/deploy/wal/summary-after-crash/test.toml index 961030e9816..f14cbbfcbc3 100644 --- a/acceptance/bundle/deploy/wal/summary-after-crash/test.toml +++ b/acceptance/bundle/deploy/wal/summary-after-crash/test.toml @@ -11,4 +11,3 @@ Response.Body = '{"job_id": 1001}' Pattern = "GET /api/2.2/jobs/get" KillCaller = 1 Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' - diff --git a/acceptance/bundle/scripts/out.deploy.direct.txt b/acceptance/bundle/scripts/out.deploy.direct.txt new file mode 100644 index 
00000000000..037f609f944 --- /dev/null +++ b/acceptance/bundle/scripts/out.deploy.direct.txt @@ -0,0 +1,24 @@ + +>>> EXITCODE=0 errcode [CLI] bundle deploy +Executing 'preinit' script +from myscript.py 0 preinit: hello stdout! +from myscript.py 0 preinit: hello stderr! +Executing 'postinit' script +from myscript.py 0 postinit: hello stdout! +from myscript.py 0 postinit: hello stderr! +Executing 'prebuild' script +from myscript.py 0 prebuild: hello stdout! +from myscript.py 0 prebuild: hello stderr! +Executing 'postbuild' script +from myscript.py 0 postbuild: hello stdout! +from myscript.py 0 postbuild: hello stderr! +Executing 'predeploy' script +from myscript.py 0 predeploy: hello stdout! +from myscript.py 0 predeploy: hello stderr! +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! +Executing 'postdeploy' script +from myscript.py 0 postdeploy: hello stdout! +from myscript.py 0 postdeploy: hello stderr! diff --git a/acceptance/bundle/scripts/out.deploy.terraform.txt b/acceptance/bundle/scripts/out.deploy.terraform.txt new file mode 100644 index 00000000000..a3d9ba342c2 --- /dev/null +++ b/acceptance/bundle/scripts/out.deploy.terraform.txt @@ -0,0 +1,23 @@ + +>>> EXITCODE=0 errcode [CLI] bundle deploy +Executing 'preinit' script +from myscript.py 0 preinit: hello stdout! +from myscript.py 0 preinit: hello stderr! +Executing 'postinit' script +from myscript.py 0 postinit: hello stdout! +from myscript.py 0 postinit: hello stderr! +Executing 'prebuild' script +from myscript.py 0 prebuild: hello stdout! +from myscript.py 0 prebuild: hello stderr! +Executing 'postbuild' script +from myscript.py 0 postbuild: hello stdout! +from myscript.py 0 postbuild: hello stderr! +Executing 'predeploy' script +from myscript.py 0 predeploy: hello stdout! +from myscript.py 0 predeploy: hello stderr! 
+Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts/default/files... +Deploying resources... +Deployment complete! +Executing 'postdeploy' script +from myscript.py 0 postdeploy: hello stdout! +from myscript.py 0 postdeploy: hello stderr! diff --git a/acceptance/bundle/scripts/output.txt b/acceptance/bundle/scripts/output.txt index 68afb2feccb..a39a0b0aa9b 100644 --- a/acceptance/bundle/scripts/output.txt +++ b/acceptance/bundle/scripts/output.txt @@ -25,26 +25,3 @@ Name: scripts Found 1 error Exit code: 1 - ->>> EXITCODE=0 errcode [CLI] bundle deploy -Executing 'preinit' script -from myscript.py 0 preinit: hello stdout! -from myscript.py 0 preinit: hello stderr! -Executing 'postinit' script -from myscript.py 0 postinit: hello stdout! -from myscript.py 0 postinit: hello stderr! -Executing 'prebuild' script -from myscript.py 0 prebuild: hello stdout! -from myscript.py 0 prebuild: hello stderr! -Executing 'postbuild' script -from myscript.py 0 postbuild: hello stdout! -from myscript.py 0 postbuild: hello stderr! -Executing 'predeploy' script -from myscript.py 0 predeploy: hello stdout! -from myscript.py 0 predeploy: hello stderr! -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts/default/files... -Deploying resources... -Deployment complete! -Executing 'postdeploy' script -from myscript.py 0 postdeploy: hello stdout! -from myscript.py 0 postdeploy: hello stderr! 
diff --git a/acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt b/acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt new file mode 100644 index 00000000000..d8fed9e4e6c --- /dev/null +++ b/acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt @@ -0,0 +1,18 @@ + +>>> errcode [CLI] bundle deploy +Executing 'preinit' script +preinit value_from_env +Executing 'postinit' script +postinit value_from_env +Executing 'prebuild' script +prebuild value_from_env +Executing 'postbuild' script +postbuild value_from_env +Executing 'predeploy' script +predeploy value_from_env +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts_with_restricted_execution/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! +Executing 'postdeploy' script +postdeploy value_from_env diff --git a/acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt b/acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt new file mode 100644 index 00000000000..efcf1281cb7 --- /dev/null +++ b/acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt @@ -0,0 +1,17 @@ + +>>> errcode [CLI] bundle deploy +Executing 'preinit' script +preinit value_from_env +Executing 'postinit' script +postinit value_from_env +Executing 'prebuild' script +prebuild value_from_env +Executing 'postbuild' script +postbuild value_from_env +Executing 'predeploy' script +predeploy value_from_env +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts_with_restricted_execution/default/files... +Deploying resources... +Deployment complete! 
+Executing 'postdeploy' script +postdeploy value_from_env diff --git a/acceptance/bundle/scripts/restricted-execution/output.txt b/acceptance/bundle/scripts/restricted-execution/output.txt index f377edba7cb..2186ac68f02 100644 --- a/acceptance/bundle/scripts/restricted-execution/output.txt +++ b/acceptance/bundle/scripts/restricted-execution/output.txt @@ -1,22 +1,5 @@ === Without DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION, all envs are accessible ->>> errcode [CLI] bundle deploy -Executing 'preinit' script -preinit value_from_env -Executing 'postinit' script -postinit value_from_env -Executing 'prebuild' script -prebuild value_from_env -Executing 'postbuild' script -postbuild value_from_env -Executing 'predeploy' script -predeploy value_from_env -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts_with_restricted_execution/default/files... -Deploying resources... -Deployment complete! -Executing 'postdeploy' script -postdeploy value_from_env - === With DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1, no envs are accessible >>> DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1 errcode [CLI] bundle deploy Error: failed to execute script: running scripts is not allowed when DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION is set diff --git a/acceptance/bundle/scripts/restricted-execution/script b/acceptance/bundle/scripts/restricted-execution/script index 7a3dcb068b4..2e31cce2eea 100644 --- a/acceptance/bundle/scripts/restricted-execution/script +++ b/acceptance/bundle/scripts/restricted-execution/script @@ -1,7 +1,7 @@ export SOME_ENV_VAR="value_from_env" title "Without DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION, all envs are accessible" -trace errcode $CLI bundle deploy +trace errcode $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 title "With DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1, no envs are accessible" trace DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1 errcode $CLI bundle deploy diff --git a/acceptance/bundle/scripts/script 
b/acceptance/bundle/scripts/script index de07d277ea9..3acb85f9cd1 100644 --- a/acceptance/bundle/scripts/script +++ b/acceptance/bundle/scripts/script @@ -1,3 +1,3 @@ trace EXITCODE=0 errcode $CLI bundle validate trace EXITCODE=1 errcode $CLI bundle validate -trace EXITCODE=0 errcode $CLI bundle deploy +trace EXITCODE=0 errcode $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt new file mode 100644 index 00000000000..0e133547de1 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy -t one +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt new file mode 100644 index 00000000000..65960fa86d5 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy -t one +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt new file mode 100644 index 00000000000..120e5902015 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy -t two +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt new file mode 100644 index 00000000000..fabdebb399f --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy -t two +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt index a03920c3fdc..69c6730b46a 100644 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt @@ -1,14 +1,4 @@ ->>> [CLI] bundle deploy -t one -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Deployment complete! - ->>> [CLI] bundle deploy -t two -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... -Deploying resources... -Deployment complete! 
- >>> cat out.requests.txt { "workspace_artifact_path_type": "WORKSPACE_FILE_SYSTEM" diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/script b/acceptance/bundle/telemetry/deploy-artifact-path-type/script index d1a63928a67..4f3bd7c3cf4 100644 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/script +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/script @@ -1,6 +1,6 @@ -trace $CLI bundle deploy -t one +trace $CLI bundle deploy -t one > out.deploy-one.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 -trace $CLI bundle deploy -t two +trace $CLI bundle deploy -t two > out.deploy-two.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {workspace_artifact_path_type}' diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt b/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt new file mode 100644 index 00000000000..1b73d1b9169 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt b/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt new file mode 100644 index 00000000000..5c6aad5b37b --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/output.txt b/acceptance/bundle/telemetry/deploy-config-file-count/output.txt index 909e8d6c705..1637965310c 100644 --- a/acceptance/bundle/telemetry/deploy-config-file-count/output.txt +++ b/acceptance/bundle/telemetry/deploy-config-file-count/output.txt @@ -1,9 +1,4 @@ ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! - >>> cat out.requests.txt { "configuration_file_count": 4 diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/script b/acceptance/bundle/telemetry/deploy-config-file-count/script index c495bdcb071..7fbdd0e6776 100644 --- a/acceptance/bundle/telemetry/deploy-config-file-count/script +++ b/acceptance/bundle/telemetry/deploy-config-file-count/script @@ -1,4 +1,4 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {configuration_file_count}' diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt new file mode 100644 index 00000000000..e86795abf5d --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy -t dev +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/dev/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt new file mode 100644 index 00000000000..ee47fabbb63 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy -t dev +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/dev/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt new file mode 100644 index 00000000000..5957e33b910 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt @@ -0,0 +1,12 @@ + +>>> [CLI] bundle deploy -t prod +Recommendation: target with 'mode: production' should set 'workspace.root_path' to make sure only one copy is deployed + +A common practice is to use a username or principal name in this path, i.e. use + + root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/prod/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt new file mode 100644 index 00000000000..ac2e13efb95 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt @@ -0,0 +1,11 @@ + +>>> [CLI] bundle deploy -t prod +Recommendation: target with 'mode: production' should set 'workspace.root_path' to make sure only one copy is deployed + +A common practice is to use a username or principal name in this path, i.e. use + + root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/prod/files... 
+Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/output.txt b/acceptance/bundle/telemetry/deploy-mode/output.txt index 99e7fbb699a..89be65f1950 100644 --- a/acceptance/bundle/telemetry/deploy-mode/output.txt +++ b/acceptance/bundle/telemetry/deploy-mode/output.txt @@ -1,20 +1,4 @@ ->>> [CLI] bundle deploy -t dev -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/dev/files... -Deploying resources... -Deployment complete! - ->>> [CLI] bundle deploy -t prod -Recommendation: target with 'mode: production' should set 'workspace.root_path' to make sure only one copy is deployed - -A common practice is to use a username or principal name in this path, i.e. use - - root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} - -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/prod/files... -Deploying resources... -Deployment complete! - >>> cat out.requests.txt { "bundle_mode": "DEVELOPMENT" diff --git a/acceptance/bundle/telemetry/deploy-mode/script b/acceptance/bundle/telemetry/deploy-mode/script index f7257769ac1..0a9d57a1a43 100644 --- a/acceptance/bundle/telemetry/deploy-mode/script +++ b/acceptance/bundle/telemetry/deploy-mode/script @@ -1,6 +1,6 @@ -trace $CLI bundle deploy -t dev +trace $CLI bundle deploy -t dev > out.deploy-dev.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 -trace $CLI bundle deploy -t prod +trace $CLI bundle deploy -t prod > out.deploy-prod.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {bundle_mode}' diff --git a/acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt b/acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt new file mode 100644 index 00000000000..0e133547de1 --- /dev/null +++ 
b/acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy -t one +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt b/acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt new file mode 100644 index 00000000000..65960fa86d5 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy -t one +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-target-count/output.txt b/acceptance/bundle/telemetry/deploy-target-count/output.txt index 31581169f2c..9c59c430234 100644 --- a/acceptance/bundle/telemetry/deploy-target-count/output.txt +++ b/acceptance/bundle/telemetry/deploy-target-count/output.txt @@ -1,9 +1,4 @@ ->>> [CLI] bundle deploy -t one -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Deployment complete! 
- >>> cat out.requests.txt { "target_count": 3 diff --git a/acceptance/bundle/telemetry/deploy-target-count/script b/acceptance/bundle/telemetry/deploy-target-count/script index 3022a2b5e49..6e9d2f7378c 100644 --- a/acceptance/bundle/telemetry/deploy-target-count/script +++ b/acceptance/bundle/telemetry/deploy-target-count/script @@ -1,4 +1,4 @@ -trace $CLI bundle deploy -t one +trace $CLI bundle deploy -t one > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {target_count}' diff --git a/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt b/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt new file mode 100644 index 00000000000..1b73d1b9169 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt b/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt new file mode 100644 index 00000000000..5c6aad5b37b --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-variable-count/output.txt b/acceptance/bundle/telemetry/deploy-variable-count/output.txt index be4840e69ef..e8580d71b39 100644 --- a/acceptance/bundle/telemetry/deploy-variable-count/output.txt +++ b/acceptance/bundle/telemetry/deploy-variable-count/output.txt @@ -1,9 +1,4 @@ ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! - >>> cat out.requests.txt { "variable_count": 6, diff --git a/acceptance/bundle/telemetry/deploy-variable-count/script b/acceptance/bundle/telemetry/deploy-variable-count/script index dad762899a2..caaf8c1f39f 100644 --- a/acceptance/bundle/telemetry/deploy-variable-count/script +++ b/acceptance/bundle/telemetry/deploy-variable-count/script @@ -1,4 +1,4 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs.[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {variable_count, lookup_variable_count, complex_variable_count}' diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt new file mode 100644 index 00000000000..f8db617c003 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt @@ -0,0 +1,8 @@ + +>>> [CLI] bundle deploy -t one +Building test... +Uploading my_test_code/dist/my_test_code-0.0.1-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt new file mode 100644 index 00000000000..048d0f07b50 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy -t one +Building test... +Uploading my_test_code/dist/my_test_code-0.0.1-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt new file mode 100644 index 00000000000..b786de11fed --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt @@ -0,0 +1,8 @@ + +>>> [CLI] bundle deploy -t two +Building test... +Uploading .databricks/bundle/two/patched_wheels/test_my_test_code/my_test_code-0.0.1+[UNIX_TIME_NANOS]-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt new file mode 100644 index 00000000000..651d315f77c --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy -t two +Building test... +Uploading .databricks/bundle/two/patched_wheels/test_my_test_code/my_test_code-0.0.1+[UNIX_TIME_NANOS]-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt index a9b8ce4ae6e..ed89628d989 100644 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt @@ -1,18 +1,4 @@ ->>> [CLI] bundle deploy -t one -Building test... -Uploading my_test_code/dist/my_test_code-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Deployment complete! - ->>> [CLI] bundle deploy -t two -Building test... -Uploading .databricks/bundle/two/patched_wheels/test_my_test_code/my_test_code-0.0.1+[UNIX_TIME_NANOS]-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... -Deploying resources... -Deployment complete! - >>> cat out.requests.txt { "bool_values": [ diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/script b/acceptance/bundle/telemetry/deploy-whl-artifacts/script index 078fa94cdd3..5bc513afb87 100644 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/script +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/script @@ -2,9 +2,9 @@ uv venv -q .venv venv_activate uv pip install -q --no-index setuptools -trace $CLI bundle deploy -t one +trace $CLI bundle deploy -t one > out.deploy-one.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 -trace $CLI bundle deploy -t two +trace $CLI bundle deploy -t two > out.deploy-two.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {bool_values}' diff --git a/acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt b/acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt new file mode 100644 index 00000000000..1b73d1b9169 --- /dev/null +++ 
b/acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt b/acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt new file mode 100644 index 00000000000..5c6aad5b37b --- /dev/null +++ b/acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/validate/sync_patterns/output.txt b/acceptance/bundle/validate/sync_patterns/output.txt index b35859d86a9..0c061fbe312 100644 --- a/acceptance/bundle/validate/sync_patterns/output.txt +++ b/acceptance/bundle/validate/sync_patterns/output.txt @@ -20,8 +20,3 @@ Validation OK! "." ] } - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/validate/sync_patterns/script b/acceptance/bundle/validate/sync_patterns/script index d2aae85444a..485556d28a6 100644 --- a/acceptance/bundle/validate/sync_patterns/script +++ b/acceptance/bundle/validate/sync_patterns/script @@ -1,5 +1,5 @@ trace $CLI bundle validate trace $CLI bundle validate -o json | jq '.sync' -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 jq 'select(.path | test("dir/test.yml"))' out.requests.txt > out.sync.txt rm out.requests.txt diff --git a/acceptance/cache/simple/out.deploy.direct.txt b/acceptance/cache/simple/out.deploy.direct.txt new file mode 100644 index 00000000000..945da6d1443 --- /dev/null +++ b/acceptance/cache/simple/out.deploy.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy -p dogfood +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/exploratory-cache-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/cache/simple/out.deploy.terraform.txt b/acceptance/cache/simple/out.deploy.terraform.txt new file mode 100644 index 00000000000..41cfbc2a2d3 --- /dev/null +++ b/acceptance/cache/simple/out.deploy.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy -p dogfood +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/exploratory-cache-test/default/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/cache/simple/script b/acceptance/cache/simple/script index a2907174bf3..524c077f460 100644 --- a/acceptance/cache/simple/script +++ b/acceptance/cache/simple/script @@ -9,7 +9,7 @@ title "Second call in a session is expected to be a cache hit\n" trace $CLI bundle validate -p dogfood --debug 2>&1 | grep "Local Cache" | grep -v "cache path" title "Bundle deploy should send telemetry values\n" -trace $CLI bundle deploy -p dogfood +trace $CLI bundle deploy -p dogfood > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace print_telemetry_bool_values | grep "local.cache" rm out.requests.txt diff --git a/acceptance/cache/simple/test.toml b/acceptance/cache/simple/test.toml index 08cabc87be6..75759db680d 100644 --- a/acceptance/cache/simple/test.toml +++ b/acceptance/cache/simple/test.toml @@ -3,6 +3,9 @@ Local = true RecordRequests = true +# Enable engine-specific output files +EnvVaryOutput = "DATABRICKS_BUNDLE_ENGINE" + # Redact structured logging fields from debug output [[Repls]] Old = ' pid=[0-9]+' diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 1f8a705e054..9de21336336 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -240,6 +240,21 @@ func (db *DeploymentState) Finalize() error { return nil } +// Close closes the WAL file handle without finalizing or truncating. +// Use this in tests or when you need to abort without saving state. 
+func (db *DeploymentState) Close() error { + db.mu.Lock() + defer db.mu.Unlock() + + if db.wal != nil { + if err := db.wal.close(); err != nil { + return err + } + db.wal = nil + } + return nil +} + func (db *DeploymentState) AssertOpened() { if db.Path == "" { panic("internal error: DeploymentState must be opened first") diff --git a/bundle/direct/dstate/wal_test.go b/bundle/direct/dstate/wal_test.go index 9c2250c830e..fb4cab1a198 100644 --- a/bundle/direct/dstate/wal_test.go +++ b/bundle/direct/dstate/wal_test.go @@ -337,6 +337,7 @@ func TestDeploymentState_WALWithDependsOn(t *testing.T) { var db DeploymentState err := db.Open(ctx, statePath) require.NoError(t, err) + t.Cleanup(func() { db.Close() }) dependsOn := []deployplan.DependsOnEntry{ {Node: "resources.clusters.cluster1", Label: "${resources.clusters.cluster1.id}"}, From 51f19743b386700a046b9ba8f2471e8586914e1e Mon Sep 17 00:00:00 2001 From: Varun Deep Saini Date: Mon, 2 Feb 2026 20:00:58 +0530 Subject: [PATCH 04/80] test fixes Signed-off-by: Varun Deep Saini --- acceptance/bundle/artifacts/shell/bash/output.txt | 1 - acceptance/bundle/artifacts/shell/basic/output.txt | 1 - acceptance/bundle/artifacts/shell/default/output.txt | 1 - acceptance/bundle/artifacts/shell/sh/output.txt | 1 - acceptance/bundle/deploy/wal/empty-wal/test.toml | 2 +- 5 files changed, 1 insertion(+), 5 deletions(-) diff --git a/acceptance/bundle/artifacts/shell/bash/output.txt b/acceptance/bundle/artifacts/shell/bash/output.txt index 8b137891791..e69de29bb2d 100644 --- a/acceptance/bundle/artifacts/shell/bash/output.txt +++ b/acceptance/bundle/artifacts/shell/bash/output.txt @@ -1 +0,0 @@ - diff --git a/acceptance/bundle/artifacts/shell/basic/output.txt b/acceptance/bundle/artifacts/shell/basic/output.txt index 8b137891791..e69de29bb2d 100644 --- a/acceptance/bundle/artifacts/shell/basic/output.txt +++ b/acceptance/bundle/artifacts/shell/basic/output.txt @@ -1 +0,0 @@ - diff --git 
a/acceptance/bundle/artifacts/shell/default/output.txt b/acceptance/bundle/artifacts/shell/default/output.txt index 8b137891791..e69de29bb2d 100644 --- a/acceptance/bundle/artifacts/shell/default/output.txt +++ b/acceptance/bundle/artifacts/shell/default/output.txt @@ -1 +0,0 @@ - diff --git a/acceptance/bundle/artifacts/shell/sh/output.txt b/acceptance/bundle/artifacts/shell/sh/output.txt index 8b137891791..e69de29bb2d 100644 --- a/acceptance/bundle/artifacts/shell/sh/output.txt +++ b/acceptance/bundle/artifacts/shell/sh/output.txt @@ -1 +0,0 @@ - diff --git a/acceptance/bundle/deploy/wal/empty-wal/test.toml b/acceptance/bundle/deploy/wal/empty-wal/test.toml index b97264c2bec..2624bdcd685 100644 --- a/acceptance/bundle/deploy/wal/empty-wal/test.toml +++ b/acceptance/bundle/deploy/wal/empty-wal/test.toml @@ -9,5 +9,5 @@ Pattern = "GET /api/2.2/jobs/get" Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' [[Repls]] -Old = '-rw[^ ]+ \d+ [^ ]+ [^ ]+ \d+ [A-Z][a-z]+ \d+ \d+:\d+' +Old = '-rw[^\s]+\s+\d+\s+[^\s]+\s+[^\s]+\s+\d+\s+[A-Z][a-z]+\s+\d+\s+\d+:\d+' New = '[FILE_INFO]' From 36ff7c41900b4c29a099450f3a5071949ea27f3a Mon Sep 17 00:00:00 2001 From: Varun Deep Saini Date: Sat, 7 Feb 2026 20:08:19 +0530 Subject: [PATCH 05/80] Fixes Signed-off-by: Varun Deep Saini --- .../artifacts/build_and_files_whl/test.toml | 2 - .../deploy/wal/chain-10-jobs/output.txt | 2 + .../deploy/wal/corrupted-wal-entry/output.txt | 8 +- .../deploy/wal/corrupted-wal-entry/script | 11 +- .../wal/corrupted-wal-middle/output.txt | 15 +- .../deploy/wal/corrupted-wal-middle/script | 18 +- .../deploy/wal/corrupted-wal-middle/test.toml | 6 +- .../deploy/wal/crash-after-create/output.txt | 2 + .../bundle/deploy/wal/empty-wal/output.txt | 4 +- acceptance/bundle/deploy/wal/empty-wal/script | 7 + .../bundle/deploy/wal/empty-wal/test.toml | 2 +- .../deploy/wal/multiple-crashes/output.txt | 3 +- .../deploy/wal/summary-after-crash/output.txt | 2 + .../deploy/wal/wal-with-delete/output.txt 
| 2 + bundle/direct/dstate/state.go | 55 ++- bundle/direct/dstate/wal.go | 324 +++++++++++++----- bundle/direct/dstate/wal_test.go | 120 +++++-- 17 files changed, 424 insertions(+), 159 deletions(-) diff --git a/acceptance/bundle/artifacts/build_and_files_whl/test.toml b/acceptance/bundle/artifacts/build_and_files_whl/test.toml index a93d901b688..e69de29bb2d 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/test.toml +++ b/acceptance/bundle/artifacts/build_and_files_whl/test.toml @@ -1,2 +0,0 @@ -RecordRequests = false -EnvVaryOutput = "DATABRICKS_BUNDLE_ENGINE" diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt index 4c4d781c805..d391548fa87 100644 --- a/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt +++ b/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt @@ -23,6 +23,8 @@ Exit code: [KILLED] 9 === Bundle summary (reads from WAL) === +Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal +Recovered 9 entries from WAL file. 
Name: wal-chain-test Target: default Workspace: diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt index f5e7f346d86..f7ebf7bfd25 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt @@ -4,11 +4,15 @@ {"lineage":"test-lineage-123","serial": [SERIAL]} {"k":"resources.jobs.valid_job","v":{"__id__": "[ID]","state":{"name":"valid-job"}}} {"k":"resources.jobs.another_valid","v":{"__id__": "[ID]","state":{"name":"another-valid"}}} -not valid json - corrupted last line (partial write from crash) +{"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial- === Deploy (should recover valid entries, skip corrupted last line) === >>> [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-test/default/files... +Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal +Warn: Could not read state file WAL entry in [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal: line 4: {"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial-: unexpected end of JSON input +Warn: Saved corrupted WAL entries to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted +Recovered 2 entries from WAL file. Deploying resources... Updating deployment state... Deployment complete! @@ -20,5 +24,7 @@ Deployment complete! 
"resources.jobs.valid_job" ] } +=== Corrupted WAL entries file === +{"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial- === WAL after successful deploy === WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script index fc36ed754fe..dde17995da6 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script @@ -11,13 +11,13 @@ cat > .databricks/bundle/default/resources.json << 'EOF' EOF echo "=== Creating WAL with corrupted LAST entry ===" -# Corrupted last line is expected (partial write from crash) and should be skipped. +# Corrupted last line is expected (truncated JSON from crash) and should be skipped. # Valid entries before it should be recovered. cat > .databricks/bundle/default/resources.json.wal << 'EOF' {"lineage":"test-lineage-123","serial":6} {"k":"resources.jobs.valid_job","v":{"__id__":"1111","state":{"name":"valid-job"}}} {"k":"resources.jobs.another_valid","v":{"__id__":"2222","state":{"name":"another-valid"}}} -not valid json - corrupted last line (partial write from crash) +{"k":"resources.jobs.partial_write","v":{"__id__":"3333","state":{"name":"partial- EOF echo "=== WAL content ===" @@ -29,6 +29,13 @@ trace $CLI bundle deploy 2>&1 echo "=== Final state (should have recovered entries) ===" cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys | sort)}' +echo "=== Corrupted WAL entries file ===" +if [ -f ".databricks/bundle/default/resources.json.wal.corrupted" ]; then + cat .databricks/bundle/default/resources.json.wal.corrupted +else + echo "Missing corrupted WAL entries file (unexpected)" +fi + echo "=== WAL after successful deploy ===" if [ -f ".databricks/bundle/default/resources.json.wal" ]; then echo "WAL exists (unexpected)" diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt 
b/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt index 4396aade670..bf9236c1f93 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt @@ -3,17 +3,20 @@ === WAL content === {"lineage":"test-lineage-456","serial": [SERIAL]} {"k":"resources.jobs.job_one","v":{"__id__": "[ID]","state":{"name":"job-one"}}} -not valid json - CORRUPTED MIDDLE LINE +{"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial- {"k":"resources.jobs.job_two","v":{"__id__": "[ID]","state":{"name":"job-two"}}} -=== Deploy (WAL should be deleted due to middle corruption) === +=== Deploy (should recover valid entries and skip corrupted line) === >>> [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-middle-test/default/files... -Warn: Failed to read WAL file, deleting and proceeding: WAL line 3: corrupted entry in middle of WAL: invalid character 'o' in literal null (expecting 'u') +Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal +Warn: Could not read state file WAL entry in [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal: line 3: {"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial-: unexpected end of JSON input +Warn: Saved corrupted WAL entries to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted +Recovered 2 entries from WAL file. Deploying resources... Updating deployment state... Deployment complete! -=== Final state (fresh deploy, not recovered from WAL) === +=== Final state (should have recovered entries) === { "serial": [SERIAL], "state_keys": [ @@ -21,5 +24,7 @@ Deployment complete! 
"resources.jobs.job_two" ] } +=== Corrupted WAL entries file === +{"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial- === WAL after deploy === -WAL deleted (expected - due to middle corruption) +WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/script b/acceptance/bundle/deploy/wal/corrupted-wal-middle/script index 46dc1922d16..6307d7fbf73 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/script +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/script @@ -11,27 +11,33 @@ cat > .databricks/bundle/default/resources.json << 'EOF' EOF echo "=== Creating WAL with corrupted MIDDLE entry ===" -# Corruption in the middle is NOT expected (only last line can be partial write). -# This should cause WAL to be deleted entirely, no recovery. +# Corrupted middle line is expected (truncated JSON from crash) and should be skipped. cat > .databricks/bundle/default/resources.json.wal << 'EOF' {"lineage":"test-lineage-456","serial":6} {"k":"resources.jobs.job_one","v":{"__id__":"1111","state":{"name":"job-one"}}} -not valid json - CORRUPTED MIDDLE LINE +{"k":"resources.jobs.partial_write","v":{"__id__":"3333","state":{"name":"partial- {"k":"resources.jobs.job_two","v":{"__id__":"2222","state":{"name":"job-two"}}} EOF echo "=== WAL content ===" cat .databricks/bundle/default/resources.json.wal -echo "=== Deploy (WAL should be deleted due to middle corruption) ===" +echo "=== Deploy (should recover valid entries and skip corrupted line) ===" trace $CLI bundle deploy 2>&1 -echo "=== Final state (fresh deploy, not recovered from WAL) ===" +echo "=== Final state (should have recovered entries) ===" cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys | sort)}' +echo "=== Corrupted WAL entries file ===" +if [ -f ".databricks/bundle/default/resources.json.wal.corrupted" ]; then + cat .databricks/bundle/default/resources.json.wal.corrupted +else + echo "Missing 
corrupted WAL entries file (unexpected)" +fi + echo "=== WAL after deploy ===" if [ -f ".databricks/bundle/default/resources.json.wal" ]; then echo "WAL exists (unexpected)" else - echo "WAL deleted (expected - due to middle corruption)" + echo "WAL deleted (expected)" fi diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml index ec6fa7b3f4f..d5f0b1bbb65 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml @@ -1,8 +1,6 @@ -# WAL with corrupted MIDDLE entry - WAL should be deleted, no recovery. -# Corruption in the middle is unexpected (not a partial write from crash). -# The entire WAL is discarded and a fresh deploy happens. +# WAL with corrupted MIDDLE entry - valid entries are recovered and corrupted entries are skipped. -# Since WAL is discarded, jobs will be created fresh (not recovered) +# Since valid entries are recovered, jobs will be updated (not created fresh). [[Server]] Pattern = "POST /api/2.2/jobs/create" Response.Body = '{"job_id": 9999}' diff --git a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt index 9ab9f4cf9ce..e32c251ae4e 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/output.txt +++ b/acceptance/bundle/deploy/wal/crash-after-create/output.txt @@ -19,6 +19,8 @@ WAL exists (expected) >>> [CLI] bundle deploy --force-lock Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files... +Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal +Recovered 1 entries from WAL file. Deploying resources... Updating deployment state... Deployment complete! 
diff --git a/acceptance/bundle/deploy/wal/empty-wal/output.txt b/acceptance/bundle/deploy/wal/empty-wal/output.txt index 21b68510803..e8e1553df78 100644 --- a/acceptance/bundle/deploy/wal/empty-wal/output.txt +++ b/acceptance/bundle/deploy/wal/empty-wal/output.txt @@ -6,12 +6,14 @@ >>> [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-empty-test/default/files... -Warn: Failed to read WAL file, deleting and proceeding: WAL file is empty +Warn: Failed to read WAL file, moved it to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted and proceeding: WAL file is empty Deploying resources... Updating deployment state... Deployment complete! === Checking WAL file after deploy === Empty WAL deleted (expected) +=== Corrupted WAL file === +[FILE_INFO] .databricks/bundle/default/resources.json.wal.corrupted === State file content === { "lineage": "[UUID]", diff --git a/acceptance/bundle/deploy/wal/empty-wal/script b/acceptance/bundle/deploy/wal/empty-wal/script index f693753ac77..2c66d213aab 100644 --- a/acceptance/bundle/deploy/wal/empty-wal/script +++ b/acceptance/bundle/deploy/wal/empty-wal/script @@ -17,5 +17,12 @@ else echo "Empty WAL deleted (expected)" fi +echo "=== Corrupted WAL file ===" +if [ -f ".databricks/bundle/default/resources.json.wal.corrupted" ]; then + ls -la .databricks/bundle/default/resources.json.wal.corrupted +else + echo "Corrupted WAL file missing (unexpected)" +fi + echo "=== State file content ===" cat .databricks/bundle/default/resources.json | jq -S '{lineage: .lineage, serial: .serial, state_keys: (.state | keys)}' diff --git a/acceptance/bundle/deploy/wal/empty-wal/test.toml b/acceptance/bundle/deploy/wal/empty-wal/test.toml index 2624bdcd685..ad64cd6e746 100644 --- a/acceptance/bundle/deploy/wal/empty-wal/test.toml +++ b/acceptance/bundle/deploy/wal/empty-wal/test.toml @@ -1,4 +1,4 @@ -# Empty WAL file should be deleted and deploy should proceed normally. 
+# Empty WAL file should be moved to .wal.corrupted and deploy should proceed normally. [[Server]] Pattern = "POST /api/2.2/jobs/create" diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/output.txt b/acceptance/bundle/deploy/wal/multiple-crashes/output.txt index 33dd984b742..e31643106bd 100644 --- a/acceptance/bundle/deploy/wal/multiple-crashes/output.txt +++ b/acceptance/bundle/deploy/wal/multiple-crashes/output.txt @@ -14,12 +14,13 @@ WAL exists >>> errcode [CLI] bundle deploy --force-lock Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files... +Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal +Recovered 1 entries from WAL file. Deploying resources... [PROCESS_KILLED] Exit code: [KILLED] === WAL after second crash === -WAL still exists === Third deploy (should succeed) === >>> [CLI] bundle deploy --force-lock diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/output.txt b/acceptance/bundle/deploy/wal/summary-after-crash/output.txt index 9a2644a60b8..3f5747ab212 100644 --- a/acceptance/bundle/deploy/wal/summary-after-crash/output.txt +++ b/acceptance/bundle/deploy/wal/summary-after-crash/output.txt @@ -23,6 +23,8 @@ WAL exists (expected) === Bundle summary (should show job_a from WAL) === >>> [CLI] bundle summary -o json +Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal +Recovered 1 entries from WAL file. { "job_a_id": "1001", "job_b_id": null diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/output.txt b/acceptance/bundle/deploy/wal/wal-with-delete/output.txt index 8f52732d3e9..f686ac48369 100644 --- a/acceptance/bundle/deploy/wal/wal-with-delete/output.txt +++ b/acceptance/bundle/deploy/wal/wal-with-delete/output.txt @@ -9,6 +9,8 @@ >>> [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-delete-test/default/files... 
+Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal +Recovered 1 entries from WAL file. Deploying resources... Updating deployment state... Deployment complete! diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 9de21336336..a54da010f17 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -14,7 +14,6 @@ import ( "github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/statemgmt/resourcestate" "github.com/databricks/cli/internal/build" - "github.com/databricks/cli/libs/log" "github.com/google/uuid" ) @@ -75,7 +74,7 @@ func (db *DeploymentState) SaveState(key, newID string, state any, dependsOn []d if err := db.ensureWALOpen(); err != nil { return fmt.Errorf("failed to open WAL: %w", err) } - if err := db.wal.writeEntry(key, &entry); err != nil { + if err := db.wal.writeJSON(WALEntry{K: key, V: &entry}); err != nil { return fmt.Errorf("failed to write WAL entry: %w", err) } @@ -96,7 +95,7 @@ func (db *DeploymentState) DeleteState(key string) error { if err := db.ensureWALOpen(); err != nil { return fmt.Errorf("failed to open WAL: %w", err) } - if err := db.wal.writeEntry(key, nil); err != nil { + if err := db.wal.writeJSON(WALEntry{K: key}); err != nil { return fmt.Errorf("failed to write WAL entry: %w", err) } @@ -126,7 +125,7 @@ func (db *DeploymentState) ensureWALOpen() error { // WAL serial is the NEXT serial (current + 1) walSerial := db.Data.Serial + 1 - if err := wal.writeHeader(lineage, walSerial); err != nil { + if err := wal.writeJSON(WALHeader{Lineage: lineage, Serial: walSerial}); err != nil { wal.close() return err } @@ -198,7 +197,12 @@ func (db *DeploymentState) Open(ctx context.Context, path string) error { return fmt.Errorf("WAL recovery failed: %w", err) } if recovered { - log.Infof(ctx, "Recovered deployment state from WAL") + if err := db.unlockedSave(); err != nil { + return err + } + if err := cleanupWAL(path); err != nil { 
+ return err + } db.recoveredFromWAL = true } @@ -212,28 +216,43 @@ func (db *DeploymentState) Finalize() error { db.mu.Lock() defer db.mu.Unlock() - // Generate lineage on first save (if WAL wasn't opened) + hadOpenWAL := db.wal != nil + if hadOpenWAL { + if err := db.wal.close(); err != nil { + return err + } + db.wal = nil + + replayResult, err := replayWAL(db.Path, &db.Data) + if err != nil { + return fmt.Errorf("failed to replay WAL during finalize: %w", err) + } + if !replayResult.recovered { + return errors.New("failed to replay WAL during finalize: WAL file not found or stale") + } + if len(replayResult.corruptedEntries) > 0 { + first := replayResult.corruptedEntries[0] + return fmt.Errorf("failed to replay WAL during finalize: corrupted entry at line %d: %v", first.lineNumber, first.parseErr) + } + } + + if db.Data.Lineage == "" && !hadOpenWAL && len(db.Data.State) == 0 { + return nil + } + if db.Data.Lineage == "" { db.Data.Lineage = uuid.New().String() } db.Data.Serial++ - err := db.unlockedSave() - if err != nil { + if err := db.unlockedSave(); err != nil { return err } - if db.wal != nil { - if err := db.wal.truncate(); err != nil { - return fmt.Errorf("failed to truncate WAL: %w", err) - } - db.wal = nil - } else { - // No WAL was opened, but we should still clean up any stale WAL file - wp := walPath(db.Path) - if err := os.Remove(wp); err != nil && !os.IsNotExist(err) { - return fmt.Errorf("failed to remove stale WAL file: %w", err) + if hadOpenWAL { + if err := cleanupWAL(db.Path); err != nil { + return err } } diff --git a/bundle/direct/dstate/wal.go b/bundle/direct/dstate/wal.go index 37dd1bffa27..494c181833e 100644 --- a/bundle/direct/dstate/wal.go +++ b/bundle/direct/dstate/wal.go @@ -2,12 +2,16 @@ package dstate import ( "bufio" + "bytes" "context" "encoding/json" "errors" "fmt" "os" + "path/filepath" + "strings" + "github.com/databricks/cli/libs/cmdio" "github.com/databricks/cli/libs/log" ) @@ -22,37 +26,40 @@ type WALEntry struct { } type 
WAL struct { - path string file *os.File } +type corruptedWALEntry struct { + lineNumber int + rawLine string + parseErr error +} + +type walReplayResult struct { + hasWAL bool + recovered bool + stale bool + entriesRecovered int + corruptedEntries []corruptedWALEntry +} + +var errWALRead = errors.New("wal read error") + func walPath(statePath string) string { return statePath + ".wal" } +func walCorruptedPath(statePath string) string { + return walPath(statePath) + ".corrupted" +} + func openWAL(statePath string) (*WAL, error) { wp := walPath(statePath) - f, err := os.OpenFile(wp, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o600) + f, err := os.OpenFile(wp, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600) if err != nil { return nil, fmt.Errorf("failed to open WAL file %q: %w", wp, err) } - return &WAL{path: wp, file: f}, nil -} - -func (w *WAL) writeHeader(lineage string, serial int) error { - header := WALHeader{ - Lineage: lineage, - Serial: serial, - } - return w.writeJSON(header) -} - -func (w *WAL) writeEntry(key string, entry *ResourceEntry) error { - walEntry := WALEntry{ - K: key, - V: entry, - } - return w.writeJSON(walEntry) + return &WAL{file: f}, nil } func (w *WAL) writeJSON(v any) error { @@ -67,6 +74,10 @@ func (w *WAL) writeJSON(v any) error { return fmt.Errorf("failed to write WAL entry: %w", err) } + if err := w.file.Sync(); err != nil { + return fmt.Errorf("failed to sync WAL entry: %w", err) + } + return nil } @@ -77,122 +88,267 @@ func (w *WAL) close() error { return nil } -func (w *WAL) truncate() error { - if w.file != nil { - w.file.Close() - w.file = nil - } - err := os.Remove(w.path) +func cleanupWAL(statePath string) error { + err := os.Remove(walPath(statePath)) if err != nil && !os.IsNotExist(err) { - return fmt.Errorf("failed to remove WAL file %q: %w", w.path, err) + return fmt.Errorf("failed to remove WAL file %q: %w", walPath(statePath), err) + } + return nil +} + +func moveWALToCorrupted(statePath string) error { + source := walPath(statePath) 
+ target := walCorruptedPath(statePath) + _ = os.Remove(target) + if err := os.Rename(source, target); err != nil { + return fmt.Errorf("failed to move WAL file %q to %q: %w", source, target, err) + } + return nil +} + +func writeCorruptedWALEntries(statePath string, corrupted []corruptedWALEntry) error { + if len(corrupted) == 0 { + return nil + } + + target := walCorruptedPath(statePath) + f, err := os.OpenFile(target, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o600) + if err != nil { + return fmt.Errorf("failed to create corrupted WAL file %q: %w", target, err) + } + defer f.Close() + + for _, entry := range corrupted { + if _, err := f.WriteString(entry.rawLine + "\n"); err != nil { + return fmt.Errorf("failed to write corrupted WAL file %q: %w", target, err) + } + } + + if err := f.Sync(); err != nil { + return fmt.Errorf("failed to sync corrupted WAL file %q: %w", target, err) } + return nil } -func readWAL(ctx context.Context, statePath string) (*WALHeader, []WALEntry, error) { +func readWAL(statePath string) (*WALHeader, []WALEntry, []corruptedWALEntry, error) { wp := walPath(statePath) f, err := os.Open(wp) if err != nil { - return nil, nil, err + return nil, nil, nil, err } defer f.Close() scanner := bufio.NewScanner(f) - var lines [][]byte + scanner.Buffer(make([]byte, 0, 64*1024), 10*1024*1024) + var header *WALHeader + var entries []WALEntry + var corrupted []corruptedWALEntry + lineNumber := 0 for scanner.Scan() { - line := scanner.Bytes() + lineNumber++ + line := bytes.TrimSpace(scanner.Bytes()) if len(line) == 0 { continue } + lineCopy := make([]byte, len(line)) copy(lineCopy, line) - lines = append(lines, lineCopy) + if header == nil { + var h WALHeader + if err := json.Unmarshal(lineCopy, &h); err != nil { + return nil, nil, nil, fmt.Errorf("failed to parse WAL header: %w", err) + } + header = &h + continue + } + + var e WALEntry + if err := json.Unmarshal(lineCopy, &e); err != nil { + corrupted = append(corrupted, corruptedWALEntry{ + lineNumber: 
lineNumber, + rawLine: string(lineCopy), + parseErr: err, + }) + continue + } + + if e.K == "" { + corrupted = append(corrupted, corruptedWALEntry{ + lineNumber: lineNumber, + rawLine: string(lineCopy), + parseErr: errors.New("entry has empty key"), + }) + continue + } + + entries = append(entries, e) } + if err := scanner.Err(); err != nil { - return nil, nil, fmt.Errorf("failed to read WAL file: %w", err) + return nil, nil, nil, fmt.Errorf("failed to read WAL file: %w", err) } - if len(lines) == 0 { - return nil, nil, errors.New("WAL file is empty") + if header == nil { + return nil, nil, nil, errors.New("WAL file is empty") } - var header WALHeader - if err := json.Unmarshal(lines[0], &header); err != nil { - return nil, nil, fmt.Errorf("failed to parse WAL header: %w", err) + return header, entries, corrupted, nil +} + +func replayWAL(statePath string, db *Database) (walReplayResult, error) { + result := walReplayResult{} + wp := walPath(statePath) + + if _, err := os.Stat(wp); os.IsNotExist(err) { + return result, nil } + result.hasWAL = true - var entries []WALEntry - for i := 1; i < len(lines); i++ { - lineNum := i + 1 - isLastLine := i == len(lines)-1 + f, err := os.Open(wp) + if err != nil { + return result, fmt.Errorf("%w: %v", errWALRead, err) + } + defer f.Close() - var e WALEntry - if err := json.Unmarshal(lines[i], &e); err != nil { - if isLastLine { - log.Debugf(ctx, "WAL line %d: skipping corrupted last entry: %v", lineNum, err) - continue - } - return nil, nil, fmt.Errorf("WAL line %d: corrupted entry in middle of WAL: %w", lineNum, err) + scanner := bufio.NewScanner(f) + scanner.Buffer(make([]byte, 0, 64*1024), 10*1024*1024) + var header *WALHeader + lineNumber := 0 + var corrupted []corruptedWALEntry + for scanner.Scan() { + lineNumber++ + line := bytes.TrimSpace(scanner.Bytes()) + if len(line) == 0 { + continue } - if e.K == "" { - if isLastLine { - log.Debugf(ctx, "WAL line %d: skipping last entry with empty key", lineNum) - continue + lineCopy 
:= make([]byte, len(line)) + copy(lineCopy, line) + if header == nil { + var h WALHeader + if err := json.Unmarshal(lineCopy, &h); err != nil { + return result, fmt.Errorf("%w: failed to parse WAL header: %w", errWALRead, err) + } + header = &h + + expectedSerial := db.Serial + 1 + if header.Serial < expectedSerial { + result.stale = true + return result, nil + } + + if header.Serial > expectedSerial { + return result, fmt.Errorf("WAL serial (%d) is ahead of expected (%d), state may be corrupted", header.Serial, expectedSerial) + } + + if db.Lineage != "" && header.Lineage != "" && db.Lineage != header.Lineage { + return result, fmt.Errorf("WAL lineage (%s) does not match state lineage (%s)", header.Lineage, db.Lineage) + } + + if db.Lineage == "" && header.Lineage != "" { + db.Lineage = header.Lineage + } + + if db.State == nil { + db.State = make(map[string]ResourceEntry) } - return nil, nil, fmt.Errorf("WAL line %d: entry with empty key in middle of WAL", lineNum) + continue } - entries = append(entries, e) + var entry WALEntry + if err := json.Unmarshal(lineCopy, &entry); err != nil { + corrupted = append(corrupted, corruptedWALEntry{ + lineNumber: lineNumber, + rawLine: string(lineCopy), + parseErr: err, + }) + continue + } + + if entry.K == "" { + corrupted = append(corrupted, corruptedWALEntry{ + lineNumber: lineNumber, + rawLine: string(lineCopy), + parseErr: errors.New("entry has empty key"), + }) + continue + } + + if entry.V != nil { + db.State[entry.K] = *entry.V + } else { + delete(db.State, entry.K) + } + result.entriesRecovered++ + } + + if err := scanner.Err(); err != nil { + return result, fmt.Errorf("%w: failed to read WAL file: %w", errWALRead, err) + } + + if header == nil { + return result, fmt.Errorf("%w: WAL file is empty", errWALRead) } - return &header, entries, nil + result.recovered = true + result.corruptedEntries = corrupted + return result, nil } func recoverFromWAL(ctx context.Context, statePath string, db *Database) (bool, error) { - 
wp := walPath(statePath) - - if _, err := os.Stat(wp); os.IsNotExist(err) { - return false, nil + replayResult, err := replayWAL(statePath, db) + if err != nil { + if errors.Is(err, errWALRead) { + if moveErr := moveWALToCorrupted(statePath); moveErr != nil { + return false, moveErr + } + log.Warnf(ctx, "Failed to read WAL file, moved it to %s and proceeding: %s", relativePathForLog(walCorruptedPath(statePath)), strings.TrimPrefix(err.Error(), errWALRead.Error()+": ")) + return false, nil + } + return false, err } - header, entries, err := readWAL(ctx, statePath) - if err != nil { - log.Warnf(ctx, "Failed to read WAL file, deleting and proceeding: %v", err) - os.Remove(wp) + if replayResult.stale { + log.Debugf(ctx, "Deleting stale WAL (serial behind current state)") + if err := cleanupWAL(statePath); err != nil { + return false, err + } return false, nil } - expectedSerial := db.Serial + 1 - if header.Serial < expectedSerial { - log.Debugf(ctx, "Deleting stale WAL (serial %d < expected %d)", header.Serial, expectedSerial) - os.Remove(wp) + if !replayResult.recovered { return false, nil } - if header.Serial > expectedSerial { - return false, fmt.Errorf("WAL serial (%d) is ahead of expected (%d), state may be corrupted", header.Serial, expectedSerial) + logRecoveryProgress(ctx, fmt.Sprintf("Recovering state from WAL file: %s", relativePathForLog(walPath(statePath)))) + walLogPath := relativePathForLog(walPath(statePath)) + for _, corrupted := range replayResult.corruptedEntries { + log.Warnf(ctx, "Could not read state file WAL entry in %s: line %d: %s: %v", walLogPath, corrupted.lineNumber, corrupted.rawLine, corrupted.parseErr) } - if db.Lineage != "" && header.Lineage != "" && db.Lineage != header.Lineage { - return false, fmt.Errorf("WAL lineage (%s) does not match state lineage (%s)", header.Lineage, db.Lineage) + if err := writeCorruptedWALEntries(statePath, replayResult.corruptedEntries); err != nil { + return false, err } - - if db.Lineage == "" && 
header.Lineage != "" { - db.Lineage = header.Lineage + if len(replayResult.corruptedEntries) > 0 { + log.Warnf(ctx, "Saved corrupted WAL entries to %s", relativePathForLog(walCorruptedPath(statePath))) } - if db.State == nil { - db.State = make(map[string]ResourceEntry) - } + logRecoveryProgress(ctx, fmt.Sprintf("Recovered %d entries from WAL file.", replayResult.entriesRecovered)) + return true, nil +} - for _, entry := range entries { - if entry.V != nil { - db.State[entry.K] = *entry.V - } else { - delete(db.State, entry.K) - } +func relativePathForLog(path string) string { + rel, err := filepath.Rel(".", path) + if err != nil { + return path } + return filepath.ToSlash(rel) +} - return true, nil +func logRecoveryProgress(ctx context.Context, message string) { + defer func() { + _ = recover() + }() + cmdio.LogString(ctx, message) } diff --git a/bundle/direct/dstate/wal_test.go b/bundle/direct/dstate/wal_test.go index fb4cab1a198..9d4533eba73 100644 --- a/bundle/direct/dstate/wal_test.go +++ b/bundle/direct/dstate/wal_test.go @@ -12,10 +12,6 @@ import ( "github.com/stretchr/testify/require" ) -func TestWALPath(t *testing.T) { - assert.Equal(t, "/path/to/state.json.wal", walPath("/path/to/state.json")) -} - func TestWALWriteAndRead(t *testing.T) { dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") @@ -23,31 +19,30 @@ func TestWALWriteAndRead(t *testing.T) { wal, err := openWAL(statePath) require.NoError(t, err) - err = wal.writeHeader("test-lineage", 1) + err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 1}) require.NoError(t, err) entry1 := &ResourceEntry{ ID: "12345", State: json.RawMessage(`{"name":"job1"}`), } - err = wal.writeEntry("resources.jobs.job1", entry1) + err = wal.writeJSON(WALEntry{K: "resources.jobs.job1", V: entry1}) require.NoError(t, err) entry2 := &ResourceEntry{ ID: "67890", State: json.RawMessage(`{"name":"job2"}`), } - err = wal.writeEntry("resources.jobs.job2", entry2) + err = wal.writeJSON(WALEntry{K: 
"resources.jobs.job2", V: entry2}) require.NoError(t, err) - err = wal.writeEntry("resources.jobs.old_job", nil) + err = wal.writeJSON(WALEntry{K: "resources.jobs.old_job", V: nil}) require.NoError(t, err) err = wal.close() require.NoError(t, err) - ctx := context.Background() - header, entries, err := readWAL(ctx, statePath) + header, entries, _, err := readWAL(statePath) require.NoError(t, err) assert.Equal(t, "test-lineage", header.Lineage) @@ -67,26 +62,41 @@ func TestWALWriteAndRead(t *testing.T) { assert.Nil(t, entries[2].V) } -func TestWALTruncate(t *testing.T) { +func TestCleanupWAL(t *testing.T) { dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") walFilePath := walPath(statePath) wal, err := openWAL(statePath) require.NoError(t, err) - err = wal.writeHeader("test-lineage", 1) + err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 1}) require.NoError(t, err) _, err = os.Stat(walFilePath) require.NoError(t, err) - err = wal.truncate() + err = wal.close() + require.NoError(t, err) + err = cleanupWAL(statePath) require.NoError(t, err) _, err = os.Stat(walFilePath) assert.True(t, os.IsNotExist(err)) } +func TestOpenWALFailsIfFileAlreadyExists(t *testing.T) { + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + + wal, err := openWAL(statePath) + require.NoError(t, err) + require.NoError(t, wal.close()) + + _, err = openWAL(statePath) + require.Error(t, err) + assert.Contains(t, err.Error(), "failed to open WAL file") +} + func TestRecoverFromWAL_NoWAL(t *testing.T) { ctx := context.Background() dir := t.TempDir() @@ -105,14 +115,14 @@ func TestRecoverFromWAL_ValidWAL(t *testing.T) { wal, err := openWAL(statePath) require.NoError(t, err) - err = wal.writeHeader("test-lineage", 1) + err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 1}) require.NoError(t, err) entry := &ResourceEntry{ ID: "12345", State: json.RawMessage(`{"name":"job1"}`), } - err = wal.writeEntry("resources.jobs.job1", entry) + err = 
wal.writeJSON(WALEntry{K: "resources.jobs.job1", V: entry}) require.NoError(t, err) err = wal.close() require.NoError(t, err) @@ -136,7 +146,7 @@ func TestRecoverFromWAL_StaleWAL(t *testing.T) { wal, err := openWAL(statePath) require.NoError(t, err) - err = wal.writeHeader("test-lineage", 1) + err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 1}) require.NoError(t, err) err = wal.close() require.NoError(t, err) @@ -158,7 +168,7 @@ func TestRecoverFromWAL_FutureWAL(t *testing.T) { wal, err := openWAL(statePath) require.NoError(t, err) - err = wal.writeHeader("test-lineage", 5) + err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 5}) require.NoError(t, err) err = wal.close() require.NoError(t, err) @@ -177,7 +187,7 @@ func TestRecoverFromWAL_LineageMismatch(t *testing.T) { wal, err := openWAL(statePath) require.NoError(t, err) - err = wal.writeHeader("lineage-A", 1) + err = wal.writeJSON(WALHeader{Lineage: "lineage-A", Serial: 1}) require.NoError(t, err) err = wal.close() require.NoError(t, err) @@ -196,17 +206,17 @@ func TestRecoverFromWAL_DeleteOperation(t *testing.T) { wal, err := openWAL(statePath) require.NoError(t, err) - err = wal.writeHeader("test-lineage", 1) + err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 1}) require.NoError(t, err) entry := &ResourceEntry{ ID: "12345", State: json.RawMessage(`{"name":"job1"}`), } - err = wal.writeEntry("resources.jobs.job1", entry) + err = wal.writeJSON(WALEntry{K: "resources.jobs.job1", V: entry}) require.NoError(t, err) - err = wal.writeEntry("resources.jobs.job1", nil) + err = wal.writeJSON(WALEntry{K: "resources.jobs.job1", V: nil}) require.NoError(t, err) err = wal.close() @@ -237,7 +247,7 @@ func TestDeploymentState_WALIntegration(t *testing.T) { _, err = os.Stat(walFilePath) require.NoError(t, err) - header, entries, err := readWAL(ctx, statePath) + header, entries, _, err := readWAL(statePath) require.NoError(t, err) assert.Equal(t, 1, header.Serial) require.Len(t, entries, 
1) @@ -276,13 +286,13 @@ func TestDeploymentState_WALRecoveryOnOpen(t *testing.T) { wal, err := openWAL(statePath) require.NoError(t, err) - err = wal.writeHeader("test-lineage", 6) + err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 6}) require.NoError(t, err) entry := &ResourceEntry{ ID: "new-id", State: json.RawMessage(`{"name":"new"}`), } - err = wal.writeEntry("resources.jobs.new", entry) + err = wal.writeJSON(WALEntry{K: "resources.jobs.new", V: entry}) require.NoError(t, err) err = wal.close() require.NoError(t, err) @@ -311,7 +321,7 @@ func TestDeploymentState_DeleteStateWritesWAL(t *testing.T) { err = db.DeleteState("resources.jobs.job1") require.NoError(t, err) - _, entries, err := readWAL(ctx, statePath) + _, entries, _, err := readWAL(statePath) require.NoError(t, err) require.Len(t, entries, 2) @@ -346,7 +356,7 @@ func TestDeploymentState_WALWithDependsOn(t *testing.T) { err = db.SaveState("resources.jobs.job1", "12345", map[string]string{"name": "job1"}, dependsOn) require.NoError(t, err) - _, entries, err := readWAL(ctx, statePath) + _, entries, _, err := readWAL(statePath) require.NoError(t, err) require.Len(t, entries, 1) @@ -372,11 +382,19 @@ not valid json db := NewDatabase("", 0) recovered, err := recoverFromWAL(ctx, statePath, &db) require.NoError(t, err) - assert.False(t, recovered) - assert.Empty(t, db.State) + assert.True(t, recovered) + assert.Len(t, db.State, 2) + assert.Equal(t, "12345", db.State["resources.jobs.job1"].ID) + assert.Equal(t, "67890", db.State["resources.jobs.job2"].ID) + corruptedPath := walCorruptedPath(statePath) + _, err = os.Stat(corruptedPath) + require.NoError(t, err) + contentBytes, err := os.ReadFile(corruptedPath) + require.NoError(t, err) + assert.Equal(t, "not valid json\n", string(contentBytes)) _, err = os.Stat(walFilePath) - assert.True(t, os.IsNotExist(err)) + require.NoError(t, err) } func TestRecoverFromWAL_CorruptedLastLine(t *testing.T) { @@ -402,6 +420,13 @@ not valid json 
assert.Contains(t, db.State, "resources.jobs.job2") assert.Equal(t, "12345", db.State["resources.jobs.job1"].ID) assert.Equal(t, "67890", db.State["resources.jobs.job2"].ID) + + corruptedPath := walCorruptedPath(statePath) + _, err = os.Stat(corruptedPath) + require.NoError(t, err) + contentBytes, err := os.ReadFile(corruptedPath) + require.NoError(t, err) + assert.Equal(t, "not valid json\n", string(contentBytes)) } func TestDeploymentState_RecoveredFromWALFlag(t *testing.T) { @@ -417,9 +442,9 @@ func TestDeploymentState_RecoveredFromWALFlag(t *testing.T) { wal, err := openWAL(statePath) require.NoError(t, err) - err = wal.writeHeader("test-lineage", 1) + err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 1}) require.NoError(t, err) - err = wal.writeEntry("resources.jobs.job1", &ResourceEntry{ID: "123", State: json.RawMessage(`{}`)}) + err = wal.writeJSON(WALEntry{K: "resources.jobs.job1", V: &ResourceEntry{ID: "123", State: json.RawMessage(`{}`)}}) require.NoError(t, err) err = wal.close() require.NoError(t, err) @@ -451,7 +476,6 @@ func TestRecoverFromWAL_LineageAdoption(t *testing.T) { } func TestReadWAL_EmptyFile(t *testing.T) { - ctx := context.Background() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") walFilePath := walPath(statePath) @@ -459,7 +483,7 @@ func TestReadWAL_EmptyFile(t *testing.T) { err := os.WriteFile(walFilePath, []byte(""), 0o600) require.NoError(t, err) - _, _, err = readWAL(ctx, statePath) + _, _, _, err = readWAL(statePath) assert.Error(t, err) assert.Contains(t, err.Error(), "empty") } @@ -482,7 +506,7 @@ func TestDeploymentState_MultipleOperationsSameKey(t *testing.T) { err = db.SaveState("resources.jobs.job1", "222", map[string]string{"v": "2"}, nil) require.NoError(t, err) - _, entries, err := readWAL(ctx, statePath) + _, entries, _, err := readWAL(statePath) require.NoError(t, err) require.Len(t, entries, 3) assert.Equal(t, "111", entries[0].V.ID) @@ -496,3 +520,31 @@ func 
TestDeploymentState_MultipleOperationsSameKey(t *testing.T) { require.True(t, ok) assert.Equal(t, "222", entry.ID) } + +func TestDeploymentState_FinalizeFailsOnCorruptedWAL(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + walFilePath := walPath(statePath) + + var db DeploymentState + err := db.Open(ctx, statePath) + require.NoError(t, err) + + err = db.SaveState("resources.jobs.job1", "12345", map[string]string{"name": "job1"}, nil) + require.NoError(t, err) + + f, err := os.OpenFile(walFilePath, os.O_WRONLY|os.O_APPEND, 0) + require.NoError(t, err) + _, err = f.WriteString("{\"k\":\"resources.jobs.partial_write\",\"v\":{\"__id__\":\"999\",\"state\":{\"name\":\"partial-\n") + require.NoError(t, err) + require.NoError(t, f.Sync()) + require.NoError(t, f.Close()) + + err = db.Finalize() + require.Error(t, err) + assert.Contains(t, err.Error(), "failed to replay WAL during finalize: corrupted entry at line") + + _, err = os.Stat(walFilePath) + require.NoError(t, err) +} From 338ae0edab8e2094f738c8ffca08e08e5472c5e8 Mon Sep 17 00:00:00 2001 From: Varun Deep Saini Date: Mon, 9 Feb 2026 23:51:25 +0530 Subject: [PATCH 06/80] fixed tests Signed-off-by: Varun Deep Saini --- .../bundle/artifacts/build_and_files_whl/test.toml | 1 + .../bundle/artifacts/shell/cmd/out.deploy.direct.txt | 7 +++++++ .../bundle/artifacts/shell/cmd/out.deploy.terraform.txt | 6 ++++++ acceptance/bundle/artifacts/shell/cmd/output.txt | 6 ------ acceptance/bundle/artifacts/shell/cmd/script | 2 +- acceptance/bundle/deploy/wal/test.toml | 9 +++++++++ bundle/direct/dstate/wal.go | 2 +- 7 files changed, 25 insertions(+), 8 deletions(-) create mode 100644 acceptance/bundle/artifacts/shell/cmd/out.deploy.direct.txt create mode 100644 acceptance/bundle/artifacts/shell/cmd/out.deploy.terraform.txt diff --git a/acceptance/bundle/artifacts/build_and_files_whl/test.toml b/acceptance/bundle/artifacts/build_and_files_whl/test.toml index 
e69de29bb2d..a030353d571 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/test.toml +++ b/acceptance/bundle/artifacts/build_and_files_whl/test.toml @@ -0,0 +1 @@ +RecordRequests = false diff --git a/acceptance/bundle/artifacts/shell/cmd/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/cmd/out.deploy.direct.txt new file mode 100644 index 00000000000..e034bae7db3 --- /dev/null +++ b/acceptance/bundle/artifacts/shell/cmd/out.deploy.direct.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-cmd/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/cmd/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/cmd/out.deploy.terraform.txt new file mode 100644 index 00000000000..8ebed9f66d4 --- /dev/null +++ b/acceptance/bundle/artifacts/shell/cmd/out.deploy.terraform.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-cmd/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/cmd/output.txt b/acceptance/bundle/artifacts/shell/cmd/output.txt index 8ebed9f66d4..e69de29bb2d 100644 --- a/acceptance/bundle/artifacts/shell/cmd/output.txt +++ b/acceptance/bundle/artifacts/shell/cmd/output.txt @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-cmd/default/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/cmd/script b/acceptance/bundle/artifacts/shell/cmd/script index 68ebb78d775..09bb41643ca 100644 --- a/acceptance/bundle/artifacts/shell/cmd/script +++ b/acceptance/bundle/artifacts/shell/cmd/script @@ -1 +1 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/deploy/wal/test.toml b/acceptance/bundle/deploy/wal/test.toml index 1632ddb1957..df700645f7a 100644 --- a/acceptance/bundle/deploy/wal/test.toml +++ b/acceptance/bundle/deploy/wal/test.toml @@ -22,6 +22,15 @@ Exit code:""" Old = 'Exit code: (137|1)' New = 'Exit code: [KILLED]' +# On Windows, no bash "Killed" message appears when CLI has produced output before termination. +# Insert [PROCESS_KILLED] between last output line and exit code for consistency. +[[Repls]] +Old = '(Deploying resources\.\.\.)\n\nExit code: \[KILLED\]' +New = """${1} +[PROCESS_KILLED] + +Exit code: [KILLED]""" + [[Repls]] Old = "\r" New = '' diff --git a/bundle/direct/dstate/wal.go b/bundle/direct/dstate/wal.go index 494c181833e..cd422c37dff 100644 --- a/bundle/direct/dstate/wal.go +++ b/bundle/direct/dstate/wal.go @@ -321,7 +321,7 @@ func recoverFromWAL(ctx context.Context, statePath string, db *Database) (bool, return false, nil } - logRecoveryProgress(ctx, fmt.Sprintf("Recovering state from WAL file: %s", relativePathForLog(walPath(statePath)))) + logRecoveryProgress(ctx, "Recovering state from WAL file: "+relativePathForLog(walPath(statePath))) walLogPath := relativePathForLog(walPath(statePath)) for _, corrupted := range replayResult.corruptedEntries { log.Warnf(ctx, "Could not read state file WAL entry in %s: line %d: %s: %v", walLogPath, corrupted.lineNumber, corrupted.rawLine, corrupted.parseErr) From ebb16ae8f0a2e1e060d6d78c6772826b382903a1 Mon Sep 17 00:00:00 2001 From: Varun Deep Saini Date: Tue, 24 Mar 2026 23:50:01 +0530 Subject: [PATCH 07/80] updated tests Signed-off-by: Varun Deep Saini --- 
.../test.toml | 1 - .../shell/bash/out.deploy.direct.txt | 7 - ...ut.deploy.terraform.txt => out.deploy.txt} | 0 acceptance/bundle/artifacts/shell/bash/script | 2 +- .../shell/basic/out.deploy.direct.txt | 7 - ...ut.deploy.terraform.txt => out.deploy.txt} | 0 .../bundle/artifacts/shell/basic/script | 2 +- .../artifacts/shell/cmd/out.deploy.direct.txt | 7 - ...ut.deploy.terraform.txt => out.deploy.txt} | 0 acceptance/bundle/artifacts/shell/cmd/script | 2 +- .../shell/default/out.deploy.direct.txt | 7 - ...ut.deploy.terraform.txt => out.deploy.txt} | 0 .../bundle/artifacts/shell/default/script | 2 +- .../artifacts/shell/sh/out.deploy.direct.txt | 7 - ...ut.deploy.terraform.txt => out.deploy.txt} | 0 acceptance/bundle/artifacts/shell/sh/script | 2 +- acceptance/bundle/artifacts/shell/test.toml | 4 + .../deploy/wal/chain-10-jobs/output.txt | 359 +++++++++++++++++- .../bundle/deploy/wal/chain-10-jobs/script | 2 +- 19 files changed, 359 insertions(+), 52 deletions(-) delete mode 100644 acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt rename acceptance/bundle/artifacts/shell/bash/{out.deploy.terraform.txt => out.deploy.txt} (100%) delete mode 100644 acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt rename acceptance/bundle/artifacts/shell/basic/{out.deploy.terraform.txt => out.deploy.txt} (100%) delete mode 100644 acceptance/bundle/artifacts/shell/cmd/out.deploy.direct.txt rename acceptance/bundle/artifacts/shell/cmd/{out.deploy.terraform.txt => out.deploy.txt} (100%) delete mode 100644 acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt rename acceptance/bundle/artifacts/shell/default/{out.deploy.terraform.txt => out.deploy.txt} (100%) delete mode 100644 acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt rename acceptance/bundle/artifacts/shell/sh/{out.deploy.terraform.txt => out.deploy.txt} (100%) diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml 
b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml index b6c55dac31e..a0a680e9d19 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml @@ -1,5 +1,4 @@ RecordRequests = true -EnvVaryOutput = "DATABRICKS_BUNDLE_ENGINE" Ignore = [ '.venv', 'dist', diff --git a/acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt deleted file mode 100644 index f311959abdd..00000000000 --- a/acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/bash/out.deploy.txt similarity index 100% rename from acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt rename to acceptance/bundle/artifacts/shell/bash/out.deploy.txt diff --git a/acceptance/bundle/artifacts/shell/bash/script b/acceptance/bundle/artifacts/shell/bash/script index 09bb41643ca..eae08378509 100644 --- a/acceptance/bundle/artifacts/shell/bash/script +++ b/acceptance/bundle/artifacts/shell/bash/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy > out.deploy.txt 2>&1 diff --git a/acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt deleted file mode 100644 index 3a4ff9138ba..00000000000 --- a/acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... 
-Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-basic/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/basic/out.deploy.txt similarity index 100% rename from acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt rename to acceptance/bundle/artifacts/shell/basic/out.deploy.txt diff --git a/acceptance/bundle/artifacts/shell/basic/script b/acceptance/bundle/artifacts/shell/basic/script index 09bb41643ca..eae08378509 100644 --- a/acceptance/bundle/artifacts/shell/basic/script +++ b/acceptance/bundle/artifacts/shell/basic/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy > out.deploy.txt 2>&1 diff --git a/acceptance/bundle/artifacts/shell/cmd/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/cmd/out.deploy.direct.txt deleted file mode 100644 index e034bae7db3..00000000000 --- a/acceptance/bundle/artifacts/shell/cmd/out.deploy.direct.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-cmd/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/cmd/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/cmd/out.deploy.txt similarity index 100% rename from acceptance/bundle/artifacts/shell/cmd/out.deploy.terraform.txt rename to acceptance/bundle/artifacts/shell/cmd/out.deploy.txt diff --git a/acceptance/bundle/artifacts/shell/cmd/script b/acceptance/bundle/artifacts/shell/cmd/script index 09bb41643ca..eae08378509 100644 --- a/acceptance/bundle/artifacts/shell/cmd/script +++ b/acceptance/bundle/artifacts/shell/cmd/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy > out.deploy.txt 2>&1 diff --git a/acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt deleted file mode 100644 index f311959abdd..00000000000 --- a/acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/default/out.deploy.txt similarity index 100% rename from acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt rename to acceptance/bundle/artifacts/shell/default/out.deploy.txt diff --git a/acceptance/bundle/artifacts/shell/default/script b/acceptance/bundle/artifacts/shell/default/script index 09bb41643ca..eae08378509 100644 --- a/acceptance/bundle/artifacts/shell/default/script +++ b/acceptance/bundle/artifacts/shell/default/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy > out.deploy.txt 2>&1 diff --git a/acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt deleted file mode 100644 index 98820986f53..00000000000 --- a/acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-sh/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/sh/out.deploy.txt similarity index 100% rename from acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt rename to acceptance/bundle/artifacts/shell/sh/out.deploy.txt diff --git a/acceptance/bundle/artifacts/shell/sh/script b/acceptance/bundle/artifacts/shell/sh/script index 09bb41643ca..eae08378509 100644 --- a/acceptance/bundle/artifacts/shell/sh/script +++ b/acceptance/bundle/artifacts/shell/sh/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy > out.deploy.txt 2>&1 diff --git a/acceptance/bundle/artifacts/shell/test.toml b/acceptance/bundle/artifacts/shell/test.toml index 9796804e9a9..df72afb6c82 100644 --- a/acceptance/bundle/artifacts/shell/test.toml +++ b/acceptance/bundle/artifacts/shell/test.toml @@ -1,3 +1,7 @@ Local = true Cloud = false RecordRequests = false + +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt index d391548fa87..b172c4fc060 100644 --- a/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt +++ b/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt @@ -8,16 +8,355 @@ Deploying resources... 
Exit code: [KILLED] === WAL content after crash === -{"lineage":"[UUID]","serial": [SERIAL]} -{"k":"resources.jobs.job_01","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"first in chain","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-01","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]}}} -{"k":"resources.jobs.job_02","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-02","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_01","label":"${resources.jobs.job_01.id}"}]}} -{"k":"resources.jobs.job_03","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 
1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-03","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_02","label":"${resources.jobs.job_02.id}"}]}} -{"k":"resources.jobs.job_04","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-04","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_03","label":"${resources.jobs.job_03.id}"}]}} -{"k":"resources.jobs.job_05","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-05","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_04","label":"${resources.jobs.job_04.id}"}]}} -{"k":"resources.jobs.job_06","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 
1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-06","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_05","label":"${resources.jobs.job_05.id}"}]}} -{"k":"resources.jobs.job_07","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-07","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_06","label":"${resources.jobs.job_06.id}"}]}} -{"k":"resources.jobs.job_08","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-08","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_07","label":"${resources.jobs.job_07.id}"}]}} -{"k":"resources.jobs.job_09","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 
1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-09","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_08","label":"${resources.jobs.job_08.id}"}]}} +{ + "lineage": "[UUID]", + "serial": [SERIAL] +} +{ + "k": "resources.jobs.job_01", + "v": { + "__id__": "[ID]", + "state": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" + }, + "description": "first in chain", + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job-01", + "queue": { + "enabled": true + }, + "tasks": [ + { + "new_cluster": { + "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, + "spark_version": "15.4.x-scala2.12" + }, + "spark_python_task": { + "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" + }, + "task_key": "task" + } + ] + } + } +} +{ + "k": "resources.jobs.job_02", + "v": { + "__id__": "[ID]", + "depends_on": [ + { + "label": "${resources.jobs.job_01.id}", + "node": "resources.jobs.job_01" + } + ], + "state": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" + }, + "description": "depends on 1001", + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job-02", + "queue": { + "enabled": true + }, + "tasks": [ + { + "new_cluster": { + "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, + "spark_version": "15.4.x-scala2.12" + }, + "spark_python_task": { + "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" + }, + "task_key": "task" + } + ] + } + } +} +{ + 
"k": "resources.jobs.job_03", + "v": { + "__id__": "[ID]", + "depends_on": [ + { + "label": "${resources.jobs.job_02.id}", + "node": "resources.jobs.job_02" + } + ], + "state": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" + }, + "description": "depends on 1001", + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job-03", + "queue": { + "enabled": true + }, + "tasks": [ + { + "new_cluster": { + "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, + "spark_version": "15.4.x-scala2.12" + }, + "spark_python_task": { + "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" + }, + "task_key": "task" + } + ] + } + } +} +{ + "k": "resources.jobs.job_04", + "v": { + "__id__": "[ID]", + "depends_on": [ + { + "label": "${resources.jobs.job_03.id}", + "node": "resources.jobs.job_03" + } + ], + "state": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" + }, + "description": "depends on 1001", + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job-04", + "queue": { + "enabled": true + }, + "tasks": [ + { + "new_cluster": { + "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, + "spark_version": "15.4.x-scala2.12" + }, + "spark_python_task": { + "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" + }, + "task_key": "task" + } + ] + } + } +} +{ + "k": "resources.jobs.job_05", + "v": { + "__id__": "[ID]", + "depends_on": [ + { + "label": "${resources.jobs.job_04.id}", + "node": "resources.jobs.job_04" + } + ], + "state": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" + }, + "description": "depends on 1001", + "edit_mode": "UI_LOCKED", + 
"format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job-05", + "queue": { + "enabled": true + }, + "tasks": [ + { + "new_cluster": { + "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, + "spark_version": "15.4.x-scala2.12" + }, + "spark_python_task": { + "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" + }, + "task_key": "task" + } + ] + } + } +} +{ + "k": "resources.jobs.job_06", + "v": { + "__id__": "[ID]", + "depends_on": [ + { + "label": "${resources.jobs.job_05.id}", + "node": "resources.jobs.job_05" + } + ], + "state": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" + }, + "description": "depends on 1001", + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job-06", + "queue": { + "enabled": true + }, + "tasks": [ + { + "new_cluster": { + "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, + "spark_version": "15.4.x-scala2.12" + }, + "spark_python_task": { + "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" + }, + "task_key": "task" + } + ] + } + } +} +{ + "k": "resources.jobs.job_07", + "v": { + "__id__": "[ID]", + "depends_on": [ + { + "label": "${resources.jobs.job_06.id}", + "node": "resources.jobs.job_06" + } + ], + "state": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" + }, + "description": "depends on 1001", + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job-07", + "queue": { + "enabled": true + }, + "tasks": [ + { + "new_cluster": { + "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, + "spark_version": "15.4.x-scala2.12" + }, + "spark_python_task": { + "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" + }, + "task_key": "task" + } + ] 
+ } + } +} +{ + "k": "resources.jobs.job_08", + "v": { + "__id__": "[ID]", + "depends_on": [ + { + "label": "${resources.jobs.job_07.id}", + "node": "resources.jobs.job_07" + } + ], + "state": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" + }, + "description": "depends on 1001", + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job-08", + "queue": { + "enabled": true + }, + "tasks": [ + { + "new_cluster": { + "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, + "spark_version": "15.4.x-scala2.12" + }, + "spark_python_task": { + "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" + }, + "task_key": "task" + } + ] + } + } +} +{ + "k": "resources.jobs.job_09", + "v": { + "__id__": "[ID]", + "depends_on": [ + { + "label": "${resources.jobs.job_08.id}", + "node": "resources.jobs.job_08" + } + ], + "state": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" + }, + "description": "depends on 1001", + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job-09", + "queue": { + "enabled": true + }, + "tasks": [ + { + "new_cluster": { + "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, + "spark_version": "15.4.x-scala2.12" + }, + "spark_python_task": { + "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" + }, + "task_key": "task" + } + ] + } + } +} === Number of jobs saved in WAL === 9 diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/script b/acceptance/bundle/deploy/wal/chain-10-jobs/script index 6cf2dd32f04..1f829232ad9 100644 --- a/acceptance/bundle/deploy/wal/chain-10-jobs/script +++ b/acceptance/bundle/deploy/wal/chain-10-jobs/script @@ -3,7 +3,7 @@ trace errcode $CLI bundle deploy echo "" echo "=== WAL content 
after crash ===" -cat .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "No WAL file" +jq -S . .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "No WAL file" echo "" echo "=== Number of jobs saved in WAL ===" From 184d4a496ee15f1d3f09e233189481aaa4b29a35 Mon Sep 17 00:00:00 2001 From: Varun Deep Saini Date: Wed, 25 Mar 2026 00:18:39 +0530 Subject: [PATCH 08/80] dedup Signed-off-by: Varun Deep Saini --- .../out.deploy.direct.txt | 7 -- .../out.deploy.terraform.txt | 6 - .../output.txt | 6 + .../script | 2 +- .../test.toml | 4 + .../build_and_files_whl/out.deploy.direct.txt | 8 -- .../out.deploy.terraform.txt | 7 -- .../artifacts/build_and_files_whl/output.txt | 7 ++ .../artifacts/build_and_files_whl/script | 2 +- .../artifacts/build_and_files_whl/test.toml | 4 + .../artifacts/shell/bash/out.deploy.txt | 6 - .../bundle/artifacts/shell/bash/output.txt | 6 + acceptance/bundle/artifacts/shell/bash/script | 2 +- .../artifacts/shell/basic/out.deploy.txt | 6 - .../bundle/artifacts/shell/basic/output.txt | 6 + .../bundle/artifacts/shell/basic/script | 2 +- .../bundle/artifacts/shell/cmd/out.deploy.txt | 6 - .../bundle/artifacts/shell/cmd/output.txt | 6 + acceptance/bundle/artifacts/shell/cmd/script | 2 +- .../artifacts/shell/default/out.deploy.txt | 6 - .../bundle/artifacts/shell/default/output.txt | 6 + .../bundle/artifacts/shell/default/script | 2 +- .../bundle/artifacts/shell/sh/out.deploy.txt | 6 - .../bundle/artifacts/shell/sh/output.txt | 6 + acceptance/bundle/artifacts/shell/sh/script | 2 +- .../deploy/empty-bundle/out.deploy.direct.txt | 6 - .../empty-bundle/out.deploy.terraform.txt | 5 - .../bundle/deploy/empty-bundle/output.txt | 5 + acceptance/bundle/deploy/empty-bundle/script | 2 +- .../bundle/deploy/empty-bundle/test.toml | 3 + .../deploy/wal/future-serial-wal/test.toml | 4 - .../deploy/wal/lineage-mismatch/test.toml | 4 - .../deploy/wal/wal-with-delete/test.toml | 7 -- .../bundle/scripts/out.deploy.direct.txt | 24 ---- 
.../bundle/scripts/out.deploy.terraform.txt | 23 ---- acceptance/bundle/scripts/output.txt | 23 ++++ .../out.deploy.direct.txt | 18 --- .../out.deploy.terraform.txt | 17 --- .../scripts/restricted-execution/output.txt | 17 +++ .../scripts/restricted-execution/script | 2 +- .../scripts/restricted-execution/test.toml | 3 + acceptance/bundle/scripts/script | 2 +- acceptance/bundle/scripts/test.toml | 3 + .../out.deploy-one.direct.txt | 6 - .../out.deploy-one.terraform.txt | 5 - .../out.deploy-two.direct.txt | 6 - .../out.deploy-two.terraform.txt | 5 - .../deploy-artifact-path-type/output.txt | 10 ++ .../deploy-artifact-path-type/script | 4 +- .../deploy-artifact-path-type/test.toml | 4 + .../out.deploy.direct.txt | 6 - .../out.deploy.terraform.txt | 5 - .../deploy-config-file-count/output.txt | 5 + .../telemetry/deploy-config-file-count/script | 2 +- .../deploy-config-file-count/test.toml | 3 + .../deploy-mode/out.deploy-dev.direct.txt | 6 - .../deploy-mode/out.deploy-dev.terraform.txt | 5 - .../deploy-mode/out.deploy-prod.direct.txt | 12 -- .../deploy-mode/out.deploy-prod.terraform.txt | 11 -- .../bundle/telemetry/deploy-mode/output.txt | 16 +++ .../bundle/telemetry/deploy-mode/script | 4 +- .../bundle/telemetry/deploy-mode/test.toml | 3 + .../deploy-target-count/out.deploy.direct.txt | 6 - .../out.deploy.terraform.txt | 5 - .../telemetry/deploy-target-count/output.txt | 5 + .../telemetry/deploy-target-count/script | 2 +- .../telemetry/deploy-target-count/test.toml | 3 + .../out.deploy.direct.txt | 6 - .../out.deploy.terraform.txt | 5 - .../deploy-variable-count/output.txt | 5 + .../telemetry/deploy-variable-count/script | 2 +- .../telemetry/deploy-variable-count/test.toml | 4 + .../out.deploy-one.direct.txt | 8 -- .../out.deploy-one.terraform.txt | 7 -- .../out.deploy-two.direct.txt | 8 -- .../out.deploy-two.terraform.txt | 7 -- .../telemetry/deploy-whl-artifacts/output.txt | 14 +++ .../telemetry/deploy-whl-artifacts/script | 4 +- 
.../telemetry/deploy-whl-artifacts/test.toml | 4 + .../sync_patterns/out.deploy.direct.txt | 6 - .../sync_patterns/out.deploy.terraform.txt | 5 - .../bundle/validate/sync_patterns/output.txt | 5 + .../bundle/validate/sync_patterns/script | 2 +- .../bundle/validate/sync_patterns/test.toml | 4 + acceptance/cache/simple/out.deploy.direct.txt | 6 - .../cache/simple/out.deploy.terraform.txt | 5 - acceptance/cache/simple/script | 2 +- acceptance/cache/simple/test.toml | 7 +- acceptance/internal/prepare_server.go | 3 - bundle/direct/bundle_apply.go | 1 - bundle/direct/dstate/state.go | 29 ++--- bundle/direct/dstate/wal.go | 103 ++++-------------- bundle/direct/dstate/wal_test.go | 33 +++--- 93 files changed, 264 insertions(+), 451 deletions(-) delete mode 100644 acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt delete mode 100644 acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt delete mode 100644 acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/artifacts/shell/bash/out.deploy.txt delete mode 100644 acceptance/bundle/artifacts/shell/basic/out.deploy.txt delete mode 100644 acceptance/bundle/artifacts/shell/cmd/out.deploy.txt delete mode 100644 acceptance/bundle/artifacts/shell/default/out.deploy.txt delete mode 100644 acceptance/bundle/artifacts/shell/sh/out.deploy.txt delete mode 100644 acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt delete mode 100644 acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/deploy/wal/future-serial-wal/test.toml delete mode 100644 acceptance/bundle/deploy/wal/lineage-mismatch/test.toml delete mode 100644 acceptance/bundle/deploy/wal/wal-with-delete/test.toml delete mode 100644 acceptance/bundle/scripts/out.deploy.direct.txt delete mode 100644 
acceptance/bundle/scripts/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt delete mode 100644 acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt create mode 100644 acceptance/bundle/scripts/restricted-execution/test.toml create mode 100644 acceptance/bundle/scripts/test.toml delete mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-config-file-count/test.toml delete mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-mode/test.toml delete mode 100644 acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-target-count/test.toml delete mode 100644 acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt delete 
mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt delete mode 100644 acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt delete mode 100644 acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt delete mode 100644 acceptance/cache/simple/out.deploy.direct.txt delete mode 100644 acceptance/cache/simple/out.deploy.terraform.txt diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt deleted file mode 100644 index f75a5428b16..00000000000 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading whl/source.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt deleted file mode 100644 index 8ec9c52db62..00000000000 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading whl/source.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt index 6c8bd962a56..6d24880e6c0 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt @@ -1,4 +1,10 @@ +>>> [CLI] bundle deploy +Uploading whl/source.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! + === Expecting wheel to be uploaded >>> jq .path "/api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files/whl/source.whl" diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script index fba3a777006..883601185c9 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script @@ -2,7 +2,7 @@ mkdir -p whl echo "test wheel content" > whl/source.whl -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy title "Expecting wheel to be uploaded" trace jq .path < out.requests.txt | grep import | grep whl | sort diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml index a0a680e9d19..8185d0df6e5 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml @@ -8,6 +8,10 @@ Ignore = [ '*.whl', ] +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' + [[Server]] Pattern = "GET /api/2.1/clusters/get" Response.Body = ''' diff --git 
a/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt b/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt deleted file mode 100644 index 4039d5917e8..00000000000 --- a/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt +++ /dev/null @@ -1,8 +0,0 @@ - ->>> errcode [CLI] bundle deploy -Building artifact_with_custom_dist... -Uploading mydist/my_test_code-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt b/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt deleted file mode 100644 index 9894e5b89ff..00000000000 --- a/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> errcode [CLI] bundle deploy -Building artifact_with_custom_dist... -Uploading mydist/my_test_code-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/build_and_files_whl/output.txt b/acceptance/bundle/artifacts/build_and_files_whl/output.txt index d44a21b582a..b618de6b89a 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/output.txt +++ b/acceptance/bundle/artifacts/build_and_files_whl/output.txt @@ -7,3 +7,10 @@ Workspace: Path: /Workspace/Users/[USERNAME]/.bundle/test-bundle/default Validation OK! + +>>> errcode [CLI] bundle deploy +Building artifact_with_custom_dist... +Uploading mydist/my_test_code-0.0.1-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/artifacts/build_and_files_whl/script b/acceptance/bundle/artifacts/build_and_files_whl/script index 9aa0d870e7a..2d7d63f7fec 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/script +++ b/acceptance/bundle/artifacts/build_and_files_whl/script @@ -1,5 +1,5 @@ cp -r $TESTDIR/../whl_explicit/my_test_code/{setup.py,src} . trace $CLI bundle validate # I expect this deploy to work because I explicitly told where to find the wheel, but it does not: -trace errcode $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace errcode $CLI bundle deploy rm mydist/my_test_code-0.0.1-py3-none-any.whl setup.py src/*.py diff --git a/acceptance/bundle/artifacts/build_and_files_whl/test.toml b/acceptance/bundle/artifacts/build_and_files_whl/test.toml index a030353d571..8b65645e5a3 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/test.toml +++ b/acceptance/bundle/artifacts/build_and_files_whl/test.toml @@ -1 +1,5 @@ RecordRequests = false + +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git a/acceptance/bundle/artifacts/shell/bash/out.deploy.txt b/acceptance/bundle/artifacts/shell/bash/out.deploy.txt deleted file mode 100644 index fa5d7b76bcd..00000000000 --- a/acceptance/bundle/artifacts/shell/bash/out.deploy.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/bash/output.txt b/acceptance/bundle/artifacts/shell/bash/output.txt index e69de29bb2d..fa5d7b76bcd 100644 --- a/acceptance/bundle/artifacts/shell/bash/output.txt +++ b/acceptance/bundle/artifacts/shell/bash/output.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/bash/script b/acceptance/bundle/artifacts/shell/bash/script index eae08378509..68ebb78d775 100644 --- a/acceptance/bundle/artifacts/shell/bash/script +++ b/acceptance/bundle/artifacts/shell/bash/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.txt 2>&1 +trace $CLI bundle deploy diff --git a/acceptance/bundle/artifacts/shell/basic/out.deploy.txt b/acceptance/bundle/artifacts/shell/basic/out.deploy.txt deleted file mode 100644 index b5e01c79e67..00000000000 --- a/acceptance/bundle/artifacts/shell/basic/out.deploy.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-basic/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/basic/output.txt b/acceptance/bundle/artifacts/shell/basic/output.txt index e69de29bb2d..b5e01c79e67 100644 --- a/acceptance/bundle/artifacts/shell/basic/output.txt +++ b/acceptance/bundle/artifacts/shell/basic/output.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-basic/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/basic/script b/acceptance/bundle/artifacts/shell/basic/script index eae08378509..68ebb78d775 100644 --- a/acceptance/bundle/artifacts/shell/basic/script +++ b/acceptance/bundle/artifacts/shell/basic/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.txt 2>&1 +trace $CLI bundle deploy diff --git a/acceptance/bundle/artifacts/shell/cmd/out.deploy.txt b/acceptance/bundle/artifacts/shell/cmd/out.deploy.txt deleted file mode 100644 index 8ebed9f66d4..00000000000 --- a/acceptance/bundle/artifacts/shell/cmd/out.deploy.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... 
-Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-cmd/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/cmd/output.txt b/acceptance/bundle/artifacts/shell/cmd/output.txt index e69de29bb2d..8ebed9f66d4 100644 --- a/acceptance/bundle/artifacts/shell/cmd/output.txt +++ b/acceptance/bundle/artifacts/shell/cmd/output.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-cmd/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/cmd/script b/acceptance/bundle/artifacts/shell/cmd/script index eae08378509..68ebb78d775 100644 --- a/acceptance/bundle/artifacts/shell/cmd/script +++ b/acceptance/bundle/artifacts/shell/cmd/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.txt 2>&1 +trace $CLI bundle deploy diff --git a/acceptance/bundle/artifacts/shell/default/out.deploy.txt b/acceptance/bundle/artifacts/shell/default/out.deploy.txt deleted file mode 100644 index fa5d7b76bcd..00000000000 --- a/acceptance/bundle/artifacts/shell/default/out.deploy.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/default/output.txt b/acceptance/bundle/artifacts/shell/default/output.txt index e69de29bb2d..fa5d7b76bcd 100644 --- a/acceptance/bundle/artifacts/shell/default/output.txt +++ b/acceptance/bundle/artifacts/shell/default/output.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/default/script b/acceptance/bundle/artifacts/shell/default/script index eae08378509..68ebb78d775 100644 --- a/acceptance/bundle/artifacts/shell/default/script +++ b/acceptance/bundle/artifacts/shell/default/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.txt 2>&1 +trace $CLI bundle deploy diff --git a/acceptance/bundle/artifacts/shell/sh/out.deploy.txt b/acceptance/bundle/artifacts/shell/sh/out.deploy.txt deleted file mode 100644 index 5117e6e9fc0..00000000000 --- a/acceptance/bundle/artifacts/shell/sh/out.deploy.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-sh/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/sh/output.txt b/acceptance/bundle/artifacts/shell/sh/output.txt index e69de29bb2d..5117e6e9fc0 100644 --- a/acceptance/bundle/artifacts/shell/sh/output.txt +++ b/acceptance/bundle/artifacts/shell/sh/output.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-sh/default/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/sh/script b/acceptance/bundle/artifacts/shell/sh/script index eae08378509..68ebb78d775 100644 --- a/acceptance/bundle/artifacts/shell/sh/script +++ b/acceptance/bundle/artifacts/shell/sh/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.txt 2>&1 +trace $CLI bundle deploy diff --git a/acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt b/acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt deleted file mode 100644 index 81dddfcb9fc..00000000000 --- a/acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt b/acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt deleted file mode 100644 index 494f76c84fa..00000000000 --- a/acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/deploy/empty-bundle/output.txt b/acceptance/bundle/deploy/empty-bundle/output.txt index 8498653a6e7..919accb661f 100644 --- a/acceptance/bundle/deploy/empty-bundle/output.txt +++ b/acceptance/bundle/deploy/empty-bundle/output.txt @@ -1,4 +1,9 @@ +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... +Deploying resources... +Deployment complete! 
+ >>> [CLI] bundle destroy --auto-approve All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default diff --git a/acceptance/bundle/deploy/empty-bundle/script b/acceptance/bundle/deploy/empty-bundle/script index b74818f1b1a..775ccd0defc 100644 --- a/acceptance/bundle/deploy/empty-bundle/script +++ b/acceptance/bundle/deploy/empty-bundle/script @@ -4,4 +4,4 @@ cleanup() { } trap cleanup EXIT -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy diff --git a/acceptance/bundle/deploy/empty-bundle/test.toml b/acceptance/bundle/deploy/empty-bundle/test.toml index f64800a1636..84da5529dc0 100644 --- a/acceptance/bundle/deploy/empty-bundle/test.toml +++ b/acceptance/bundle/deploy/empty-bundle/test.toml @@ -2,3 +2,6 @@ Cloud = true [EnvMatrix] DATABRICKS_BUNDLE_ENABLE_EXPERIMENTAL_YAML_SYNC = ["", "true"] +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/test.toml b/acceptance/bundle/deploy/wal/future-serial-wal/test.toml deleted file mode 100644 index 424fe2f1275..00000000000 --- a/acceptance/bundle/deploy/wal/future-serial-wal/test.toml +++ /dev/null @@ -1,4 +0,0 @@ -# WAL with serial ahead of state - indicates corruption, should error. -# State has serial=2, WAL has serial=5 (expected would be 3). - -# No server stubs needed - deploy should fail before any API calls. diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/test.toml b/acceptance/bundle/deploy/wal/lineage-mismatch/test.toml deleted file mode 100644 index 509cc82f095..00000000000 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/test.toml +++ /dev/null @@ -1,4 +0,0 @@ -# WAL with different lineage than state - should error. -# State has lineage "state-lineage-aaa", WAL has lineage "wal-lineage-bbb". - -# No server stubs needed - deploy should fail before any API calls. 
diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/test.toml b/acceptance/bundle/deploy/wal/wal-with-delete/test.toml deleted file mode 100644 index 4f81ae46952..00000000000 --- a/acceptance/bundle/deploy/wal/wal-with-delete/test.toml +++ /dev/null @@ -1,7 +0,0 @@ -# WAL recovery after crash during delete operation (simulated). -# Delete was recorded in WAL but not finalized. Deploy should complete the delete. -# Note: Real crash testing for delete is not possible because there's no API call -# after DeleteState (unlike create which has refreshRemoteState after SaveState). - -# No server stubs needed - the delete was already done (recorded in WAL) -# and the job no longer needs API calls diff --git a/acceptance/bundle/scripts/out.deploy.direct.txt b/acceptance/bundle/scripts/out.deploy.direct.txt deleted file mode 100644 index 037f609f944..00000000000 --- a/acceptance/bundle/scripts/out.deploy.direct.txt +++ /dev/null @@ -1,24 +0,0 @@ - ->>> EXITCODE=0 errcode [CLI] bundle deploy -Executing 'preinit' script -from myscript.py 0 preinit: hello stdout! -from myscript.py 0 preinit: hello stderr! -Executing 'postinit' script -from myscript.py 0 postinit: hello stdout! -from myscript.py 0 postinit: hello stderr! -Executing 'prebuild' script -from myscript.py 0 prebuild: hello stdout! -from myscript.py 0 prebuild: hello stderr! -Executing 'postbuild' script -from myscript.py 0 postbuild: hello stdout! -from myscript.py 0 postbuild: hello stderr! -Executing 'predeploy' script -from myscript.py 0 predeploy: hello stdout! -from myscript.py 0 predeploy: hello stderr! -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! -Executing 'postdeploy' script -from myscript.py 0 postdeploy: hello stdout! -from myscript.py 0 postdeploy: hello stderr! 
diff --git a/acceptance/bundle/scripts/out.deploy.terraform.txt b/acceptance/bundle/scripts/out.deploy.terraform.txt deleted file mode 100644 index a3d9ba342c2..00000000000 --- a/acceptance/bundle/scripts/out.deploy.terraform.txt +++ /dev/null @@ -1,23 +0,0 @@ - ->>> EXITCODE=0 errcode [CLI] bundle deploy -Executing 'preinit' script -from myscript.py 0 preinit: hello stdout! -from myscript.py 0 preinit: hello stderr! -Executing 'postinit' script -from myscript.py 0 postinit: hello stdout! -from myscript.py 0 postinit: hello stderr! -Executing 'prebuild' script -from myscript.py 0 prebuild: hello stdout! -from myscript.py 0 prebuild: hello stderr! -Executing 'postbuild' script -from myscript.py 0 postbuild: hello stdout! -from myscript.py 0 postbuild: hello stderr! -Executing 'predeploy' script -from myscript.py 0 predeploy: hello stdout! -from myscript.py 0 predeploy: hello stderr! -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts/default/files... -Deploying resources... -Deployment complete! -Executing 'postdeploy' script -from myscript.py 0 postdeploy: hello stdout! -from myscript.py 0 postdeploy: hello stderr! diff --git a/acceptance/bundle/scripts/output.txt b/acceptance/bundle/scripts/output.txt index a39a0b0aa9b..68afb2feccb 100644 --- a/acceptance/bundle/scripts/output.txt +++ b/acceptance/bundle/scripts/output.txt @@ -25,3 +25,26 @@ Name: scripts Found 1 error Exit code: 1 + +>>> EXITCODE=0 errcode [CLI] bundle deploy +Executing 'preinit' script +from myscript.py 0 preinit: hello stdout! +from myscript.py 0 preinit: hello stderr! +Executing 'postinit' script +from myscript.py 0 postinit: hello stdout! +from myscript.py 0 postinit: hello stderr! +Executing 'prebuild' script +from myscript.py 0 prebuild: hello stdout! +from myscript.py 0 prebuild: hello stderr! +Executing 'postbuild' script +from myscript.py 0 postbuild: hello stdout! +from myscript.py 0 postbuild: hello stderr! 
+Executing 'predeploy' script +from myscript.py 0 predeploy: hello stdout! +from myscript.py 0 predeploy: hello stderr! +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts/default/files... +Deploying resources... +Deployment complete! +Executing 'postdeploy' script +from myscript.py 0 postdeploy: hello stdout! +from myscript.py 0 postdeploy: hello stderr! diff --git a/acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt b/acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt deleted file mode 100644 index d8fed9e4e6c..00000000000 --- a/acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt +++ /dev/null @@ -1,18 +0,0 @@ - ->>> errcode [CLI] bundle deploy -Executing 'preinit' script -preinit value_from_env -Executing 'postinit' script -postinit value_from_env -Executing 'prebuild' script -prebuild value_from_env -Executing 'postbuild' script -postbuild value_from_env -Executing 'predeploy' script -predeploy value_from_env -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts_with_restricted_execution/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! -Executing 'postdeploy' script -postdeploy value_from_env diff --git a/acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt b/acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt deleted file mode 100644 index efcf1281cb7..00000000000 --- a/acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt +++ /dev/null @@ -1,17 +0,0 @@ - ->>> errcode [CLI] bundle deploy -Executing 'preinit' script -preinit value_from_env -Executing 'postinit' script -postinit value_from_env -Executing 'prebuild' script -prebuild value_from_env -Executing 'postbuild' script -postbuild value_from_env -Executing 'predeploy' script -predeploy value_from_env -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts_with_restricted_execution/default/files... 
-Deploying resources... -Deployment complete! -Executing 'postdeploy' script -postdeploy value_from_env diff --git a/acceptance/bundle/scripts/restricted-execution/output.txt b/acceptance/bundle/scripts/restricted-execution/output.txt index 2186ac68f02..f377edba7cb 100644 --- a/acceptance/bundle/scripts/restricted-execution/output.txt +++ b/acceptance/bundle/scripts/restricted-execution/output.txt @@ -1,5 +1,22 @@ === Without DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION, all envs are accessible +>>> errcode [CLI] bundle deploy +Executing 'preinit' script +preinit value_from_env +Executing 'postinit' script +postinit value_from_env +Executing 'prebuild' script +prebuild value_from_env +Executing 'postbuild' script +postbuild value_from_env +Executing 'predeploy' script +predeploy value_from_env +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts_with_restricted_execution/default/files... +Deploying resources... +Deployment complete! +Executing 'postdeploy' script +postdeploy value_from_env + === With DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1, no envs are accessible >>> DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1 errcode [CLI] bundle deploy Error: failed to execute script: running scripts is not allowed when DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION is set diff --git a/acceptance/bundle/scripts/restricted-execution/script b/acceptance/bundle/scripts/restricted-execution/script index 2e31cce2eea..7a3dcb068b4 100644 --- a/acceptance/bundle/scripts/restricted-execution/script +++ b/acceptance/bundle/scripts/restricted-execution/script @@ -1,7 +1,7 @@ export SOME_ENV_VAR="value_from_env" title "Without DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION, all envs are accessible" -trace errcode $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace errcode $CLI bundle deploy title "With DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1, no envs are accessible" trace DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1 errcode $CLI bundle deploy 
diff --git a/acceptance/bundle/scripts/restricted-execution/test.toml b/acceptance/bundle/scripts/restricted-execution/test.toml new file mode 100644 index 00000000000..2a2e9c20339 --- /dev/null +++ b/acceptance/bundle/scripts/restricted-execution/test.toml @@ -0,0 +1,3 @@ +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git a/acceptance/bundle/scripts/script b/acceptance/bundle/scripts/script index 3acb85f9cd1..de07d277ea9 100644 --- a/acceptance/bundle/scripts/script +++ b/acceptance/bundle/scripts/script @@ -1,3 +1,3 @@ trace EXITCODE=0 errcode $CLI bundle validate trace EXITCODE=1 errcode $CLI bundle validate -trace EXITCODE=0 errcode $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace EXITCODE=0 errcode $CLI bundle deploy diff --git a/acceptance/bundle/scripts/test.toml b/acceptance/bundle/scripts/test.toml new file mode 100644 index 00000000000..2a2e9c20339 --- /dev/null +++ b/acceptance/bundle/scripts/test.toml @@ -0,0 +1,3 @@ +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt deleted file mode 100644 index 0e133547de1..00000000000 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -t one -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Updating deployment state... -Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt deleted file mode 100644 index 65960fa86d5..00000000000 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -t one -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt deleted file mode 100644 index 120e5902015..00000000000 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -t two -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt deleted file mode 100644 index fabdebb399f..00000000000 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -t two -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt index 69c6730b46a..a03920c3fdc 100644 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt @@ -1,4 +1,14 @@ +>>> [CLI] bundle deploy -t one +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Deployment complete! + +>>> [CLI] bundle deploy -t two +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... +Deploying resources... +Deployment complete! + >>> cat out.requests.txt { "workspace_artifact_path_type": "WORKSPACE_FILE_SYSTEM" diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/script b/acceptance/bundle/telemetry/deploy-artifact-path-type/script index 4f3bd7c3cf4..d1a63928a67 100644 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/script +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/script @@ -1,6 +1,6 @@ -trace $CLI bundle deploy -t one > out.deploy-one.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t one -trace $CLI bundle deploy -t two > out.deploy-two.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t two trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {workspace_artifact_path_type}' diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/test.toml b/acceptance/bundle/telemetry/deploy-artifact-path-type/test.toml index 32b75237a12..d4126948d39 100644 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/test.toml +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/test.toml @@ -20,3 +20,7 @@ Response.Body = '{}' # I'm adding 405 because that's what this test originally do. 
It's somewhat # surprising though that CLI can receive 405 and that does not result in error anywhere. Response.StatusCode = 405 + +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt b/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt deleted file mode 100644 index 1b73d1b9169..00000000000 --- a/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt b/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt deleted file mode 100644 index 5c6aad5b37b..00000000000 --- a/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/output.txt b/acceptance/bundle/telemetry/deploy-config-file-count/output.txt index 1637965310c..909e8d6c705 100644 --- a/acceptance/bundle/telemetry/deploy-config-file-count/output.txt +++ b/acceptance/bundle/telemetry/deploy-config-file-count/output.txt @@ -1,4 +1,9 @@ +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! 
+ >>> cat out.requests.txt { "configuration_file_count": 4 diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/script b/acceptance/bundle/telemetry/deploy-config-file-count/script index 7fbdd0e6776..c495bdcb071 100644 --- a/acceptance/bundle/telemetry/deploy-config-file-count/script +++ b/acceptance/bundle/telemetry/deploy-config-file-count/script @@ -1,4 +1,4 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {configuration_file_count}' diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/test.toml b/acceptance/bundle/telemetry/deploy-config-file-count/test.toml new file mode 100644 index 00000000000..2a2e9c20339 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-config-file-count/test.toml @@ -0,0 +1,3 @@ +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt deleted file mode 100644 index e86795abf5d..00000000000 --- a/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -t dev -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/dev/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt deleted file mode 100644 index ee47fabbb63..00000000000 --- a/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -t dev -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/dev/files... 
-Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt deleted file mode 100644 index 5957e33b910..00000000000 --- a/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt +++ /dev/null @@ -1,12 +0,0 @@ - ->>> [CLI] bundle deploy -t prod -Recommendation: target with 'mode: production' should set 'workspace.root_path' to make sure only one copy is deployed - -A common practice is to use a username or principal name in this path, i.e. use - - root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} - -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/prod/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt deleted file mode 100644 index ac2e13efb95..00000000000 --- a/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt +++ /dev/null @@ -1,11 +0,0 @@ - ->>> [CLI] bundle deploy -t prod -Recommendation: target with 'mode: production' should set 'workspace.root_path' to make sure only one copy is deployed - -A common practice is to use a username or principal name in this path, i.e. use - - root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} - -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/prod/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-mode/output.txt b/acceptance/bundle/telemetry/deploy-mode/output.txt index 89be65f1950..99e7fbb699a 100644 --- a/acceptance/bundle/telemetry/deploy-mode/output.txt +++ b/acceptance/bundle/telemetry/deploy-mode/output.txt @@ -1,4 +1,20 @@ +>>> [CLI] bundle deploy -t dev +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/dev/files... +Deploying resources... +Deployment complete! + +>>> [CLI] bundle deploy -t prod +Recommendation: target with 'mode: production' should set 'workspace.root_path' to make sure only one copy is deployed + +A common practice is to use a username or principal name in this path, i.e. use + + root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/prod/files... +Deploying resources... +Deployment complete! + >>> cat out.requests.txt { "bundle_mode": "DEVELOPMENT" diff --git a/acceptance/bundle/telemetry/deploy-mode/script b/acceptance/bundle/telemetry/deploy-mode/script index 0a9d57a1a43..f7257769ac1 100644 --- a/acceptance/bundle/telemetry/deploy-mode/script +++ b/acceptance/bundle/telemetry/deploy-mode/script @@ -1,6 +1,6 @@ -trace $CLI bundle deploy -t dev > out.deploy-dev.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t dev -trace $CLI bundle deploy -t prod > out.deploy-prod.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t prod trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {bundle_mode}' diff --git a/acceptance/bundle/telemetry/deploy-mode/test.toml b/acceptance/bundle/telemetry/deploy-mode/test.toml new file mode 100644 index 00000000000..2a2e9c20339 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-mode/test.toml @@ -0,0 +1,3 @@ +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git 
a/acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt b/acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt deleted file mode 100644 index 0e133547de1..00000000000 --- a/acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -t one -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt b/acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt deleted file mode 100644 index 65960fa86d5..00000000000 --- a/acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -t one -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-target-count/output.txt b/acceptance/bundle/telemetry/deploy-target-count/output.txt index 9c59c430234..31581169f2c 100644 --- a/acceptance/bundle/telemetry/deploy-target-count/output.txt +++ b/acceptance/bundle/telemetry/deploy-target-count/output.txt @@ -1,4 +1,9 @@ +>>> [CLI] bundle deploy -t one +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Deployment complete! 
+ >>> cat out.requests.txt { "target_count": 3 diff --git a/acceptance/bundle/telemetry/deploy-target-count/script b/acceptance/bundle/telemetry/deploy-target-count/script index 6e9d2f7378c..3022a2b5e49 100644 --- a/acceptance/bundle/telemetry/deploy-target-count/script +++ b/acceptance/bundle/telemetry/deploy-target-count/script @@ -1,4 +1,4 @@ -trace $CLI bundle deploy -t one > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t one trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {target_count}' diff --git a/acceptance/bundle/telemetry/deploy-target-count/test.toml b/acceptance/bundle/telemetry/deploy-target-count/test.toml new file mode 100644 index 00000000000..2a2e9c20339 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-target-count/test.toml @@ -0,0 +1,3 @@ +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git a/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt b/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt deleted file mode 100644 index 1b73d1b9169..00000000000 --- a/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt b/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt deleted file mode 100644 index 5c6aad5b37b..00000000000 --- a/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... 
-Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-variable-count/output.txt b/acceptance/bundle/telemetry/deploy-variable-count/output.txt index e8580d71b39..be4840e69ef 100644 --- a/acceptance/bundle/telemetry/deploy-variable-count/output.txt +++ b/acceptance/bundle/telemetry/deploy-variable-count/output.txt @@ -1,4 +1,9 @@ +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! + >>> cat out.requests.txt { "variable_count": 6, diff --git a/acceptance/bundle/telemetry/deploy-variable-count/script b/acceptance/bundle/telemetry/deploy-variable-count/script index caaf8c1f39f..dad762899a2 100644 --- a/acceptance/bundle/telemetry/deploy-variable-count/script +++ b/acceptance/bundle/telemetry/deploy-variable-count/script @@ -1,4 +1,4 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs.[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {variable_count, lookup_variable_count, complex_variable_count}' diff --git a/acceptance/bundle/telemetry/deploy-variable-count/test.toml b/acceptance/bundle/telemetry/deploy-variable-count/test.toml index 855ecdd39ee..0a40c794b3a 100644 --- a/acceptance/bundle/telemetry/deploy-variable-count/test.toml +++ b/acceptance/bundle/telemetry/deploy-variable-count/test.toml @@ -14,3 +14,7 @@ Response.Body = ''' ] } ''' + +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt deleted file mode 100644 index f8db617c003..00000000000 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt +++ /dev/null @@ -1,8 +0,0 @@ - ->>> [CLI] bundle deploy -t one 
-Building test... -Uploading my_test_code/dist/my_test_code-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt deleted file mode 100644 index 048d0f07b50..00000000000 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -t one -Building test... -Uploading my_test_code/dist/my_test_code-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt deleted file mode 100644 index b786de11fed..00000000000 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt +++ /dev/null @@ -1,8 +0,0 @@ - ->>> [CLI] bundle deploy -t two -Building test... -Uploading .databricks/bundle/two/patched_wheels/test_my_test_code/my_test_code-0.0.1+[UNIX_TIME_NANOS]-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt deleted file mode 100644 index 651d315f77c..00000000000 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -t two -Building test... 
-Uploading .databricks/bundle/two/patched_wheels/test_my_test_code/my_test_code-0.0.1+[UNIX_TIME_NANOS]-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt index ed89628d989..a9b8ce4ae6e 100644 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt @@ -1,4 +1,18 @@ +>>> [CLI] bundle deploy -t one +Building test... +Uploading my_test_code/dist/my_test_code-0.0.1-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Deployment complete! + +>>> [CLI] bundle deploy -t two +Building test... +Uploading .databricks/bundle/two/patched_wheels/test_my_test_code/my_test_code-0.0.1+[UNIX_TIME_NANOS]-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... +Deploying resources... +Deployment complete! 
+ >>> cat out.requests.txt { "bool_values": [ diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/script b/acceptance/bundle/telemetry/deploy-whl-artifacts/script index 5bc513afb87..078fa94cdd3 100644 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/script +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/script @@ -2,9 +2,9 @@ uv venv -q .venv venv_activate uv pip install -q --no-index setuptools -trace $CLI bundle deploy -t one > out.deploy-one.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t one -trace $CLI bundle deploy -t two > out.deploy-two.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t two trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {bool_values}' diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/test.toml b/acceptance/bundle/telemetry/deploy-whl-artifacts/test.toml index 0d481507067..317e12a834d 100644 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/test.toml +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/test.toml @@ -6,3 +6,7 @@ Ignore = [ '.databricks', "__pycache__", ] + +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git a/acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt b/acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt deleted file mode 100644 index 1b73d1b9169..00000000000 --- a/acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! 
diff --git a/acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt b/acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt deleted file mode 100644 index 5c6aad5b37b..00000000000 --- a/acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/validate/sync_patterns/output.txt b/acceptance/bundle/validate/sync_patterns/output.txt index 0c061fbe312..b35859d86a9 100644 --- a/acceptance/bundle/validate/sync_patterns/output.txt +++ b/acceptance/bundle/validate/sync_patterns/output.txt @@ -20,3 +20,8 @@ Validation OK! "." ] } + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/validate/sync_patterns/script b/acceptance/bundle/validate/sync_patterns/script index 485556d28a6..d2aae85444a 100644 --- a/acceptance/bundle/validate/sync_patterns/script +++ b/acceptance/bundle/validate/sync_patterns/script @@ -1,5 +1,5 @@ trace $CLI bundle validate trace $CLI bundle validate -o json | jq '.sync' -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy jq 'select(.path | test("dir/test.yml"))' out.requests.txt > out.sync.txt rm out.requests.txt diff --git a/acceptance/bundle/validate/sync_patterns/test.toml b/acceptance/bundle/validate/sync_patterns/test.toml index 159efe02696..abc1014fd61 100644 --- a/acceptance/bundle/validate/sync_patterns/test.toml +++ b/acceptance/bundle/validate/sync_patterns/test.toml @@ -1 +1,5 @@ RecordRequests = true + +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git a/acceptance/cache/simple/out.deploy.direct.txt b/acceptance/cache/simple/out.deploy.direct.txt deleted file mode 100644 index 
945da6d1443..00000000000 --- a/acceptance/cache/simple/out.deploy.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -p dogfood -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/exploratory-cache-test/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/cache/simple/out.deploy.terraform.txt b/acceptance/cache/simple/out.deploy.terraform.txt deleted file mode 100644 index 41cfbc2a2d3..00000000000 --- a/acceptance/cache/simple/out.deploy.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -p dogfood -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/exploratory-cache-test/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/cache/simple/script b/acceptance/cache/simple/script index 524c077f460..a2907174bf3 100644 --- a/acceptance/cache/simple/script +++ b/acceptance/cache/simple/script @@ -9,7 +9,7 @@ title "Second call in a session is expected to be a cache hit\n" trace $CLI bundle validate -p dogfood --debug 2>&1 | grep "Local Cache" | grep -v "cache path" title "Bundle deploy should send telemetry values\n" -trace $CLI bundle deploy -p dogfood > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -p dogfood trace print_telemetry_bool_values | grep "local.cache" rm out.requests.txt diff --git a/acceptance/cache/simple/test.toml b/acceptance/cache/simple/test.toml index 75759db680d..f791f9a03c0 100644 --- a/acceptance/cache/simple/test.toml +++ b/acceptance/cache/simple/test.toml @@ -3,9 +3,6 @@ Local = true RecordRequests = true -# Enable engine-specific output files -EnvVaryOutput = "DATABRICKS_BUNDLE_ENGINE" - # Redact structured logging fields from debug output [[Repls]] Old = ' pid=[0-9]+' @@ -14,3 +11,7 @@ New = '' [[Repls]] Old = ' mutator=[A-Za-z]+' New = '' + +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git a/acceptance/internal/prepare_server.go 
b/acceptance/internal/prepare_server.go index dfa89ef7486..2f1b6712a2f 100644 --- a/acceptance/internal/prepare_server.go +++ b/acceptance/internal/prepare_server.go @@ -183,7 +183,6 @@ func startLocalServer(t *testing.T, s.ResponseCallback = logResponseCallback(t) } - // Track remaining kill counts and offset counts per pattern (for KillCaller > 0) killCounters := make(map[string]int) offsetCounters := make(map[string]int) killCountersMu := &sync.Mutex{} @@ -196,7 +195,6 @@ func startLocalServer(t *testing.T, items := strings.Split(stub.Pattern, " ") require.Len(t, items, 2) - // Initialize kill counter and offset counter for this pattern if stub.KillCaller > 0 { killCounters[stub.Pattern] = stub.KillCaller offsetCounters[stub.Pattern] = stub.KillCallerOffset @@ -241,7 +239,6 @@ func shouldKillCaller(stub ServerStub, offsetCounters, killCounters map[string]i mu.Lock() defer mu.Unlock() - // Still in offset period? Let this request pass. if offsetCounters[stub.Pattern] > 0 { offsetCounters[stub.Pattern]-- return false diff --git a/bundle/direct/bundle_apply.go b/bundle/direct/bundle_apply.go index aec6e7cc523..1b686519c6d 100644 --- a/bundle/direct/bundle_apply.go +++ b/bundle/direct/bundle_apply.go @@ -21,7 +21,6 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa } if len(plan.Plan) == 0 { - // Still need to finalize if WAL recovery happened to commit the recovered state if b.StateDB.RecoveredFromWAL() { if err := b.StateDB.Finalize(); err != nil { logdiag.LogError(ctx, err) diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index a54da010f17..3f5a5c4f50c 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -104,8 +104,6 @@ func (db *DeploymentState) DeleteState(key string) error { return nil } -// ensureWALOpen opens the WAL file and writes the header if not already done. -// Must be called while holding db.mu. 
func (db *DeploymentState) ensureWALOpen() error { if db.wal != nil { return nil @@ -122,7 +120,6 @@ func (db *DeploymentState) ensureWALOpen() error { db.Data.Lineage = lineage } - // WAL serial is the NEXT serial (current + 1) walSerial := db.Data.Serial + 1 if err := wal.writeJSON(WALHeader{Lineage: lineage, Serial: walSerial}); err != nil { @@ -169,15 +166,8 @@ func (db *DeploymentState) Open(ctx context.Context, path string) error { data, err := os.ReadFile(path) if err != nil { if errors.Is(err, fs.ErrNotExist) { - // Create new database with serial=0, will be incremented to 1 in Finalize() db.Data = NewDatabase("", 0) db.Path = path - - // Write state file immediately to ensure it exists before any WAL operations. - // This guarantees we have a base state file for recovery validation. - if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { - return fmt.Errorf("failed to create state directory: %w", err) - } if err := db.unlockedSave(); err != nil { return err } @@ -196,6 +186,11 @@ func (db *DeploymentState) Open(ctx context.Context, path string) error { if err != nil { return fmt.Errorf("WAL recovery failed: %w", err) } + + if err := migrateState(&db.Data); err != nil { + return fmt.Errorf("migrating state %s: %w", path, err) + } + if recovered { if err := db.unlockedSave(); err != nil { return err @@ -205,10 +200,6 @@ func (db *DeploymentState) Open(ctx context.Context, path string) error { } db.recoveredFromWAL = true } - - if err := migrateState(&db.Data); err != nil { - return fmt.Errorf("migrating state %s: %w", path, err) - } return nil } @@ -223,7 +214,8 @@ func (db *DeploymentState) Finalize() error { } db.wal = nil - replayResult, err := replayWAL(db.Path, &db.Data) + validationDB := db.Data + replayResult, err := replayWAL(db.Path, &validationDB) if err != nil { return fmt.Errorf("failed to replay WAL during finalize: %w", err) } @@ -259,8 +251,7 @@ func (db *DeploymentState) Finalize() error { return nil } -// Close closes the WAL file 
handle without finalizing or truncating. -// Use this in tests or when you need to abort without saving state. +// Close closes the WAL file without saving state. func (db *DeploymentState) Close() error { db.mu.Lock() defer db.mu.Unlock() @@ -280,8 +271,7 @@ func (db *DeploymentState) AssertOpened() { } } -// RecoveredFromWAL returns true if state was recovered from WAL during Open(). -// This is used to determine if Finalize() should be called even with an empty plan. +// RecoveredFromWAL reports whether Open recovered state from the WAL. func (db *DeploymentState) RecoveredFromWAL() bool { return db.recoveredFromWAL } @@ -316,7 +306,6 @@ func (db *DeploymentState) unlockedSave() error { return err } - // Create parent directories if they don't exist dir := filepath.Dir(db.Path) if err := os.MkdirAll(dir, 0o755); err != nil { return fmt.Errorf("failed to create directory %#v: %w", dir, err) diff --git a/bundle/direct/dstate/wal.go b/bundle/direct/dstate/wal.go index cd422c37dff..25bb3feaeae 100644 --- a/bundle/direct/dstate/wal.go +++ b/bundle/direct/dstate/wal.go @@ -36,7 +36,6 @@ type corruptedWALEntry struct { } type walReplayResult struct { - hasWAL bool recovered bool stale bool entriesRecovered int @@ -198,100 +197,46 @@ func readWAL(statePath string) (*WALHeader, []WALEntry, []corruptedWALEntry, err func replayWAL(statePath string, db *Database) (walReplayResult, error) { result := walReplayResult{} - wp := walPath(statePath) - - if _, err := os.Stat(wp); os.IsNotExist(err) { - return result, nil - } - result.hasWAL = true - - f, err := os.Open(wp) + header, entries, corrupted, err := readWAL(statePath) if err != nil { + if os.IsNotExist(err) { + return result, nil + } return result, fmt.Errorf("%w: %v", errWALRead, err) } - defer f.Close() - - scanner := bufio.NewScanner(f) - scanner.Buffer(make([]byte, 0, 64*1024), 10*1024*1024) - var header *WALHeader - lineNumber := 0 - var corrupted []corruptedWALEntry - for scanner.Scan() { - lineNumber++ - line := 
bytes.TrimSpace(scanner.Bytes()) - if len(line) == 0 { - continue - } - - lineCopy := make([]byte, len(line)) - copy(lineCopy, line) - if header == nil { - var h WALHeader - if err := json.Unmarshal(lineCopy, &h); err != nil { - return result, fmt.Errorf("%w: failed to parse WAL header: %w", errWALRead, err) - } - header = &h - - expectedSerial := db.Serial + 1 - if header.Serial < expectedSerial { - result.stale = true - return result, nil - } - - if header.Serial > expectedSerial { - return result, fmt.Errorf("WAL serial (%d) is ahead of expected (%d), state may be corrupted", header.Serial, expectedSerial) - } - if db.Lineage != "" && header.Lineage != "" && db.Lineage != header.Lineage { - return result, fmt.Errorf("WAL lineage (%s) does not match state lineage (%s)", header.Lineage, db.Lineage) - } + expectedSerial := db.Serial + 1 + if header.Serial < expectedSerial { + result.stale = true + return result, nil + } - if db.Lineage == "" && header.Lineage != "" { - db.Lineage = header.Lineage - } + if header.Serial > expectedSerial { + return result, fmt.Errorf("WAL serial (%d) is ahead of expected (%d), state may be corrupted", header.Serial, expectedSerial) + } - if db.State == nil { - db.State = make(map[string]ResourceEntry) - } - continue - } + if db.Lineage != "" && header.Lineage != "" && db.Lineage != header.Lineage { + return result, fmt.Errorf("WAL lineage (%s) does not match state lineage (%s)", header.Lineage, db.Lineage) + } - var entry WALEntry - if err := json.Unmarshal(lineCopy, &entry); err != nil { - corrupted = append(corrupted, corruptedWALEntry{ - lineNumber: lineNumber, - rawLine: string(lineCopy), - parseErr: err, - }) - continue - } + if db.Lineage == "" && header.Lineage != "" { + db.Lineage = header.Lineage + } - if entry.K == "" { - corrupted = append(corrupted, corruptedWALEntry{ - lineNumber: lineNumber, - rawLine: string(lineCopy), - parseErr: errors.New("entry has empty key"), - }) - continue - } + if db.State == nil { + db.State 
= make(map[string]ResourceEntry) + } + for _, entry := range entries { if entry.V != nil { db.State[entry.K] = *entry.V } else { delete(db.State, entry.K) } - result.entriesRecovered++ - } - - if err := scanner.Err(); err != nil { - return result, fmt.Errorf("%w: failed to read WAL file: %w", errWALRead, err) - } - - if header == nil { - return result, fmt.Errorf("%w: WAL file is empty", errWALRead) } result.recovered = true + result.entriesRecovered = len(entries) result.corruptedEntries = corrupted return result, nil } @@ -324,7 +269,7 @@ func recoverFromWAL(ctx context.Context, statePath string, db *Database) (bool, logRecoveryProgress(ctx, "Recovering state from WAL file: "+relativePathForLog(walPath(statePath))) walLogPath := relativePathForLog(walPath(statePath)) for _, corrupted := range replayResult.corruptedEntries { - log.Warnf(ctx, "Could not read state file WAL entry in %s: line %d: %s: %v", walLogPath, corrupted.lineNumber, corrupted.rawLine, corrupted.parseErr) + log.Warnf(ctx, "Could not read state file WAL entry in %s: line %d", walLogPath, corrupted.lineNumber) } if err := writeCorruptedWALEntries(statePath, replayResult.corruptedEntries); err != nil { diff --git a/bundle/direct/dstate/wal_test.go b/bundle/direct/dstate/wal_test.go index 9d4533eba73..d8a5f233452 100644 --- a/bundle/direct/dstate/wal_test.go +++ b/bundle/direct/dstate/wal_test.go @@ -1,7 +1,6 @@ package dstate import ( - "context" "encoding/json" "os" "path/filepath" @@ -98,7 +97,7 @@ func TestOpenWALFailsIfFileAlreadyExists(t *testing.T) { } func TestRecoverFromWAL_NoWAL(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") @@ -109,7 +108,7 @@ func TestRecoverFromWAL_NoWAL(t *testing.T) { } func TestRecoverFromWAL_ValidWAL(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") @@ -139,7 +138,7 @@ func 
TestRecoverFromWAL_ValidWAL(t *testing.T) { } func TestRecoverFromWAL_StaleWAL(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") walFilePath := walPath(statePath) @@ -162,7 +161,7 @@ func TestRecoverFromWAL_StaleWAL(t *testing.T) { } func TestRecoverFromWAL_FutureWAL(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") @@ -181,7 +180,7 @@ func TestRecoverFromWAL_FutureWAL(t *testing.T) { } func TestRecoverFromWAL_LineageMismatch(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") @@ -200,7 +199,7 @@ func TestRecoverFromWAL_LineageMismatch(t *testing.T) { } func TestRecoverFromWAL_DeleteOperation(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") @@ -232,7 +231,7 @@ func TestRecoverFromWAL_DeleteOperation(t *testing.T) { } func TestDeploymentState_WALIntegration(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") walFilePath := walPath(statePath) @@ -270,7 +269,7 @@ func TestDeploymentState_WALIntegration(t *testing.T) { } func TestDeploymentState_WALRecoveryOnOpen(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") @@ -307,7 +306,7 @@ func TestDeploymentState_WALRecoveryOnOpen(t *testing.T) { } func TestDeploymentState_DeleteStateWritesWAL(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") @@ -340,7 +339,7 @@ func TestDeploymentState_DeleteStateWritesWAL(t *testing.T) { } func TestDeploymentState_WALWithDependsOn(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := 
t.TempDir() statePath := filepath.Join(dir, "resources.json") @@ -366,7 +365,7 @@ func TestDeploymentState_WALWithDependsOn(t *testing.T) { } func TestRecoverFromWAL_CorruptedMiddleLine(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") walFilePath := walPath(statePath) @@ -398,7 +397,7 @@ not valid json } func TestRecoverFromWAL_CorruptedLastLine(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") walFilePath := walPath(statePath) @@ -430,7 +429,7 @@ not valid json } func TestDeploymentState_RecoveredFromWALFlag(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") @@ -457,7 +456,7 @@ func TestDeploymentState_RecoveredFromWALFlag(t *testing.T) { } func TestRecoverFromWAL_LineageAdoption(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") walFilePath := walPath(statePath) @@ -489,7 +488,7 @@ func TestReadWAL_EmptyFile(t *testing.T) { } func TestDeploymentState_MultipleOperationsSameKey(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") @@ -522,7 +521,7 @@ func TestDeploymentState_MultipleOperationsSameKey(t *testing.T) { } func TestDeploymentState_FinalizeFailsOnCorruptedWAL(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") walFilePath := walPath(statePath) From 784f7c569ea92a869bb8a4303594b5e8ba4234f7 Mon Sep 17 00:00:00 2001 From: Varun Deep Saini Date: Thu, 26 Mar 2026 20:59:25 +0530 Subject: [PATCH 09/80] Update WAL corrupted entry outputs --- acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt | 2 +- 
acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt index f7ebf7bfd25..ee28d6391e4 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt @@ -10,7 +10,7 @@ >>> [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-test/default/files... Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal -Warn: Could not read state file WAL entry in [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal: line 4: {"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial-: unexpected end of JSON input +Warn: Could not read state file WAL entry in [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal: line 4 Warn: Saved corrupted WAL entries to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted Recovered 2 entries from WAL file. Deploying resources... diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt index bf9236c1f93..ffc7ef7d04d 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt @@ -10,7 +10,7 @@ >>> [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-middle-test/default/files... 
Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal -Warn: Could not read state file WAL entry in [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal: line 3: {"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial-: unexpected end of JSON input +Warn: Could not read state file WAL entry in [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal: line 3 Warn: Saved corrupted WAL entries to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted Recovered 2 entries from WAL file. Deploying resources... From 02412321c505b6325d6d0c29f7e5741de3e1bb51 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Fri, 27 Mar 2026 17:11:26 +0100 Subject: [PATCH 10/80] WIP --- bundle/direct/bind.go | 6 +- bundle/direct/bundle_apply.go | 6 +- bundle/direct/dstate/state.go | 197 +++++++++++++++------------------- bundle/direct/dstate/wal.go | 12 +-- 4 files changed, 89 insertions(+), 132 deletions(-) diff --git a/bundle/direct/bind.go b/bundle/direct/bind.go index 08d849d14c1..7f11a8674d5 100644 --- a/bundle/direct/bind.go +++ b/bundle/direct/bind.go @@ -96,7 +96,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac } // Finalize to persist temp state to disk - err = b.StateDB.Finalize() + err = b.StateDB.Finalize(ctx) if err != nil { os.Remove(tmpStatePath) return nil, err @@ -138,7 +138,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac return nil, err } - err = b.StateDB.Finalize() + err = b.StateDB.Finalize(ctx) if err != nil { os.Remove(tmpStatePath) return nil, err @@ -216,5 +216,5 @@ func (b *DeploymentBundle) Unbind(ctx context.Context, statePath, resourceKey st } } - return b.StateDB.Finalize() + return b.StateDB.Finalize(ctx) } diff --git a/bundle/direct/bundle_apply.go b/bundle/direct/bundle_apply.go index 1b686519c6d..a7f3ee65fc2 100644 --- a/bundle/direct/bundle_apply.go +++ b/bundle/direct/bundle_apply.go @@ 
-21,11 +21,7 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa } if len(plan.Plan) == 0 { - if b.StateDB.RecoveredFromWAL() { - if err := b.StateDB.Finalize(); err != nil { - logdiag.LogError(ctx, err) - } - } + // Avoid creating state file if nothing to deploy return } diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 3f5a5c4f50c..cfa7ec21143 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -3,7 +3,6 @@ package dstate import ( "context" "encoding/json" - "errors" "fmt" "io/fs" "os" @@ -20,11 +19,10 @@ import ( const currentStateVersion = 2 type DeploymentState struct { - Path string - Data Database - mu sync.Mutex - wal *WAL - recoveredFromWAL bool + Path string + Data Database + mu sync.Mutex + walFile *os.File } type Database struct { @@ -41,6 +39,18 @@ type ResourceEntry struct { DependsOn []deployplan.DependsOnEntry `json:"depends_on,omitempty"` } +type WALHeader struct { + Lineage string `json:"lineage"` + Serial int `json:"serial"` + StateVersion int `json:"state_version"` + CLIVersion string `json:"cli_version"` +} + +type WALEntry struct { + K string `json:"k"` + V *ResourceEntry `json:"v,omitempty"` // nil means delete +} + func NewDatabase(lineage string, serial int) Database { return Database{ StateVersion: currentStateVersion, @@ -52,7 +62,7 @@ func NewDatabase(lineage string, serial int) Database { } func (db *DeploymentState) SaveState(key, newID string, state any, dependsOn []deployplan.DependsOnEntry) error { - db.AssertOpened() + db.AssertOpenedForWrite() db.mu.Lock() defer db.mu.Unlock() @@ -60,7 +70,7 @@ func (db *DeploymentState) SaveState(key, newID string, state any, dependsOn []d db.Data.State = make(map[string]ResourceEntry) } - jsonMessage, err := json.MarshalIndent(state, "", " ") + jsonMessage, err := json.Marshal(state) if err != nil { return err } @@ -71,20 +81,12 @@ func (db *DeploymentState) SaveState(key, newID string, state any, 
dependsOn []d DependsOn: dependsOn, } - if err := db.ensureWALOpen(); err != nil { - return fmt.Errorf("failed to open WAL: %w", err) - } - if err := db.wal.writeJSON(WALEntry{K: key, V: &entry}); err != nil { - return fmt.Errorf("failed to write WAL entry: %w", err) - } - db.Data.State[key] = entry - - return nil + return appendJSONLine(db.walFile, WALEntry{K: key, V: &entry}) } func (db *DeploymentState) DeleteState(key string) error { - db.AssertOpened() + db.AssertOpenedForWrite() db.mu.Lock() defer db.mu.Unlock() @@ -92,43 +94,8 @@ func (db *DeploymentState) DeleteState(key string) error { return nil } - if err := db.ensureWALOpen(); err != nil { - return fmt.Errorf("failed to open WAL: %w", err) - } - if err := db.wal.writeJSON(WALEntry{K: key}); err != nil { - return fmt.Errorf("failed to write WAL entry: %w", err) - } - delete(db.Data.State, key) - - return nil -} - -func (db *DeploymentState) ensureWALOpen() error { - if db.wal != nil { - return nil - } - - wal, err := openWAL(db.Path) - if err != nil { - return err - } - - lineage := db.Data.Lineage - if lineage == "" { - lineage = uuid.New().String() - db.Data.Lineage = lineage - } - - walSerial := db.Data.Serial + 1 - - if err := wal.writeJSON(WALHeader{Lineage: lineage, Serial: walSerial}); err != nil { - wal.close() - return err - } - - db.wal = wal - return nil + return appendJSONLine(db.walFile, WALEntry{K: key}) } func (db *DeploymentState) getResourceEntry(key string) (ResourceEntry, bool) { @@ -155,7 +122,12 @@ func (db *DeploymentState) GetResourceID(key string) string { return entry.ID } -func (db *DeploymentState) Open(ctx context.Context, path string) error { +type ( + WithRecovery bool + WithWrite bool +) + +func (db *DeploymentState) Open(ctx context.Context, path string, withRecovery WithRecovery, withWrite WithWrite) error { db.mu.Lock() defer db.mu.Unlock() @@ -166,11 +138,9 @@ func (db *DeploymentState) Open(ctx context.Context, path string) error { data, err := os.ReadFile(path) if err 
!= nil { if errors.Is(err, fs.ErrNotExist) { + // Not initializing lineage yet, we might have that saved in WAL db.Data = NewDatabase("", 0) db.Path = path - if err := db.unlockedSave(); err != nil { - return err - } } else { return err } @@ -182,73 +152,60 @@ func (db *DeploymentState) Open(ctx context.Context, path string) error { db.Path = path } - recovered, err := recoverFromWAL(ctx, path, &db.Data) - if err != nil { - return fmt.Errorf("WAL recovery failed: %w", err) + walPath := walPath(db.Path) + _, walError := os.Stat(walPath) + if walError == nil { + if withRecovery { + err := db.mergeWalIntoState(ctx) + if err != nil { + return err + } + } else { + return fmt.Errorf("unprocessed WAL exists: %s", walPath) + } } if err := migrateState(&db.Data); err != nil { return fmt.Errorf("migrating state %s: %w", path, err) } - if recovered { - if err := db.unlockedSave(); err != nil { - return err + if withWrite { + db.walFile, err = os.OpenFile(walPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600) + if err != nil { + return fmt.Errorf("failed to open WAL file %s: %w", walPath, err) } - if err := cleanupWAL(path); err != nil { - return err + lineage := db.Data.Lineage + if lineage == "" { + lineage = uuid.New().String() } - db.recoveredFromWAL = true - } - return nil -} - -func (db *DeploymentState) Finalize() error { - db.mu.Lock() - defer db.mu.Unlock() - - hadOpenWAL := db.wal != nil - if hadOpenWAL { - if err := db.wal.close(); err != nil { - return err + // Set our Serial to the next one + db.Data.Serial += 1 + walHead := WALHeader{ + Lineage: lineage, + Serial: db.Data.Serial, // next serial + StateVersion: currentStateVersion, + CLIVersion: build.GetInfo().Version, } - db.wal = nil - - validationDB := db.Data - replayResult, err := replayWAL(db.Path, &validationDB) + err := appendJSONLine(db.walFile, walHead) if err != nil { - return fmt.Errorf("failed to replay WAL during finalize: %w", err) - } - if !replayResult.recovered { - return errors.New("failed to 
replay WAL during finalize: WAL file not found or stale") - } - if len(replayResult.corruptedEntries) > 0 { - first := replayResult.corruptedEntries[0] - return fmt.Errorf("failed to replay WAL during finalize: corrupted entry at line %d: %v", first.lineNumber, first.parseErr) + return err } } - if db.Data.Lineage == "" && !hadOpenWAL && len(db.Data.State) == 0 { - return nil - } - - if db.Data.Lineage == "" { - db.Data.Lineage = uuid.New().String() - } - - db.Data.Serial++ + return nil +} - if err := db.unlockedSave(); err != nil { - return err - } +func (db *DeploymentState) mergeWalIntoState(ctx context.Context) error { +} - if hadOpenWAL { - if err := cleanupWAL(db.Path); err != nil { - return err - } - } +func (db *DeploymentState) Finalize(ctx context.Context) error { + db.AssertOpenedForWrite() + db.mu.Lock() + defer db.mu.Unlock() - return nil + db.walFile.Close() + db.walFile = nil + return db.mergeWalIntoState(ctx) } // Close closes the WAL file without saving state. @@ -271,9 +228,11 @@ func (db *DeploymentState) AssertOpened() { } } -// RecoveredFromWAL reports whether Open recovered state from the WAL. 
-func (db *DeploymentState) RecoveredFromWAL() bool { - return db.recoveredFromWAL +func (db *DeploymentState) AssertOpenedForWrite() { + db.AssertOpened() + if db.walFile == nil { + panic("internal error: DeploymentState must be opened in write mode") + } } func (db *DeploymentState) ExportState(ctx context.Context) resourcestate.ExportedResourcesMap { @@ -300,7 +259,7 @@ func (db *DeploymentState) ExportState(ctx context.Context) resourcestate.Export } func (db *DeploymentState) unlockedSave() error { - db.AssertOpened() + db.AssertOpenedForWrite() data, err := json.MarshalIndent(db.Data, "", " ") if err != nil { return err @@ -318,3 +277,15 @@ func (db *DeploymentState) unlockedSave() error { return nil } + +func appendJSONLine(file *os.File, obj any) error { + data, err := json.Marshal(obj) + if err != nil { + return err + } + data = append(data, '\n') + + _, err = file.Write(data) + // no fsync here, not needed + return err +} diff --git a/bundle/direct/dstate/wal.go b/bundle/direct/dstate/wal.go index 25bb3feaeae..9ccb12303d2 100644 --- a/bundle/direct/dstate/wal.go +++ b/bundle/direct/dstate/wal.go @@ -15,16 +15,6 @@ import ( "github.com/databricks/cli/libs/log" ) -type WALHeader struct { - Lineage string `json:"lineage"` - Serial int `json:"serial"` -} - -type WALEntry struct { - K string `json:"k"` - V *ResourceEntry `json:"v,omitempty"` // nil means delete -} - type WAL struct { file *os.File } @@ -255,7 +245,7 @@ func recoverFromWAL(ctx context.Context, statePath string, db *Database) (bool, } if replayResult.stale { - log.Debugf(ctx, "Deleting stale WAL (serial behind current state)") + log.Warnf(ctx, "Deleting stale WAL (serial=%s behind current state serial=)") if err := cleanupWAL(statePath); err != nil { return false, err } From 8b186314b162f28d023d9db42fe9bb712651322c Mon Sep 17 00:00:00 2001 From: Varun Deep Saini Date: Mon, 12 Jan 2026 21:32:47 +0530 Subject: [PATCH 11/80] Updated tests and enhanced kill caller with an offset Signed-off-by: Varun 
Deep Saini --- acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml | 1 + acceptance/bundle/deploy/wal/multiple-crashes/test.toml | 1 + 2 files changed, 2 insertions(+) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml index 6245c198409..9c9ab5a30bd 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml @@ -11,3 +11,4 @@ Response.Body = '{"job_id": 1111, "settings": {"name": "valid-job"}}' [[Server]] Pattern = "GET /api/2.2/jobs/get?job_id=2222" Response.Body = '{"job_id": 2222, "settings": {"name": "another-valid"}}' + diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/test.toml b/acceptance/bundle/deploy/wal/multiple-crashes/test.toml index 474177b8046..c5981d67208 100644 --- a/acceptance/bundle/deploy/wal/multiple-crashes/test.toml +++ b/acceptance/bundle/deploy/wal/multiple-crashes/test.toml @@ -16,3 +16,4 @@ Response.Body = '{}' Pattern = "GET /api/2.2/jobs/get" KillCaller = 1 Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' + From 0dd57abc37c46ca4389097d95ab5f24e4d573d4e Mon Sep 17 00:00:00 2001 From: Varun Deep Saini Date: Sat, 24 Jan 2026 00:51:38 +0530 Subject: [PATCH 12/80] Updated existing tests Signed-off-by: Varun Deep Saini --- .../out.deploy.direct.txt | 7 ++++++ .../out.deploy.terraform.txt | 6 +++++ .../output.txt | 6 ----- .../script | 2 +- .../test.toml | 1 + .../build_and_files_whl/out.deploy.direct.txt | 8 +++++++ .../out.deploy.terraform.txt | 7 ++++++ .../artifacts/build_and_files_whl/output.txt | 7 ------ .../artifacts/build_and_files_whl/script | 2 +- .../shell/bash/out.deploy.direct.txt | 7 ++++++ .../shell/bash/out.deploy.terraform.txt | 6 +++++ .../bundle/artifacts/shell/bash/output.txt | 5 ---- acceptance/bundle/artifacts/shell/bash/script | 2 +- .../shell/basic/out.deploy.direct.txt | 7 ++++++ .../shell/basic/out.deploy.terraform.txt | 6 
+++++ .../bundle/artifacts/shell/basic/output.txt | 5 ---- .../bundle/artifacts/shell/basic/script | 2 +- .../shell/default/out.deploy.direct.txt | 7 ++++++ .../shell/default/out.deploy.terraform.txt | 6 +++++ .../bundle/artifacts/shell/default/output.txt | 5 ---- .../bundle/artifacts/shell/default/script | 2 +- .../artifacts/shell/sh/out.deploy.direct.txt | 7 ++++++ .../shell/sh/out.deploy.terraform.txt | 6 +++++ .../bundle/artifacts/shell/sh/output.txt | 5 ---- acceptance/bundle/artifacts/shell/sh/script | 2 +- .../deploy/empty-bundle/out.deploy.direct.txt | 6 +++++ .../empty-bundle/out.deploy.terraform.txt | 5 ++++ .../bundle/deploy/empty-bundle/output.txt | 5 ---- acceptance/bundle/deploy/empty-bundle/script | 2 +- .../deploy/wal/corrupted-wal-entry/test.toml | 1 - .../deploy/wal/multiple-crashes/test.toml | 1 - .../bundle/scripts/out.deploy.direct.txt | 24 +++++++++++++++++++ .../bundle/scripts/out.deploy.terraform.txt | 23 ++++++++++++++++++ acceptance/bundle/scripts/output.txt | 23 ------------------ .../out.deploy.direct.txt | 18 ++++++++++++++ .../out.deploy.terraform.txt | 17 +++++++++++++ .../scripts/restricted-execution/output.txt | 17 ------------- .../scripts/restricted-execution/script | 2 +- acceptance/bundle/scripts/script | 2 +- .../out.deploy-one.direct.txt | 6 +++++ .../out.deploy-one.terraform.txt | 5 ++++ .../out.deploy-two.direct.txt | 6 +++++ .../out.deploy-two.terraform.txt | 5 ++++ .../deploy-artifact-path-type/output.txt | 10 -------- .../deploy-artifact-path-type/script | 4 ++-- .../out.deploy.direct.txt | 6 +++++ .../out.deploy.terraform.txt | 5 ++++ .../deploy-config-file-count/output.txt | 5 ---- .../telemetry/deploy-config-file-count/script | 2 +- .../deploy-mode/out.deploy-dev.direct.txt | 6 +++++ .../deploy-mode/out.deploy-dev.terraform.txt | 5 ++++ .../deploy-mode/out.deploy-prod.direct.txt | 12 ++++++++++ .../deploy-mode/out.deploy-prod.terraform.txt | 11 +++++++++ .../bundle/telemetry/deploy-mode/output.txt | 16 ------------- 
.../bundle/telemetry/deploy-mode/script | 4 ++-- .../deploy-target-count/out.deploy.direct.txt | 6 +++++ .../out.deploy.terraform.txt | 5 ++++ .../telemetry/deploy-target-count/output.txt | 5 ---- .../telemetry/deploy-target-count/script | 2 +- .../out.deploy.direct.txt | 6 +++++ .../out.deploy.terraform.txt | 5 ++++ .../deploy-variable-count/output.txt | 5 ---- .../telemetry/deploy-variable-count/script | 2 +- .../out.deploy-one.direct.txt | 8 +++++++ .../out.deploy-one.terraform.txt | 7 ++++++ .../out.deploy-two.direct.txt | 8 +++++++ .../out.deploy-two.terraform.txt | 7 ++++++ .../telemetry/deploy-whl-artifacts/output.txt | 14 ----------- .../telemetry/deploy-whl-artifacts/script | 4 ++-- .../sync_patterns/out.deploy.direct.txt | 6 +++++ .../sync_patterns/out.deploy.terraform.txt | 5 ++++ .../bundle/validate/sync_patterns/output.txt | 5 ---- .../bundle/validate/sync_patterns/script | 2 +- acceptance/cache/simple/out.deploy.direct.txt | 6 +++++ .../cache/simple/out.deploy.terraform.txt | 5 ++++ acceptance/cache/simple/script | 2 +- acceptance/cache/simple/test.toml | 3 +++ 77 files changed, 338 insertions(+), 160 deletions(-) create mode 100644 acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt create mode 100644 acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt create mode 100644 acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt create mode 100644 acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt create mode 100644 acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt create mode 100644 acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt create mode 100644 acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt create mode 100644 acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt create mode 100644 acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt create mode 100644 
acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt create mode 100644 acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt create mode 100644 acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt create mode 100644 acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt create mode 100644 acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt create mode 100644 acceptance/bundle/scripts/out.deploy.direct.txt create mode 100644 acceptance/bundle/scripts/out.deploy.terraform.txt create mode 100644 acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt create mode 100644 acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt create mode 100644 
acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt create mode 100644 acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt create mode 100644 acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt create mode 100644 acceptance/cache/simple/out.deploy.direct.txt create mode 100644 acceptance/cache/simple/out.deploy.terraform.txt diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt new file mode 100644 index 00000000000..f75a5428b16 --- /dev/null +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy +Uploading whl/source.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt new file mode 100644 index 00000000000..8ec9c52db62 --- /dev/null +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Uploading whl/source.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt index 6d24880e6c0..6c8bd962a56 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt @@ -1,10 +1,4 @@ ->>> [CLI] bundle deploy -Uploading whl/source.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! - === Expecting wheel to be uploaded >>> jq .path "/api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files/whl/source.whl" diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script index 883601185c9..fba3a777006 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script @@ -2,7 +2,7 @@ mkdir -p whl echo "test wheel content" > whl/source.whl -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 title "Expecting wheel to be uploaded" trace jq .path < out.requests.txt | grep import | grep whl | sort diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml index 8185d0df6e5..67a9da6c977 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml @@ -1,4 +1,5 @@ RecordRequests = true +EnvVaryOutput = "DATABRICKS_BUNDLE_ENGINE" Ignore = [ '.venv', 'dist', diff --git a/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt 
b/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt new file mode 100644 index 00000000000..4039d5917e8 --- /dev/null +++ b/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt @@ -0,0 +1,8 @@ + +>>> errcode [CLI] bundle deploy +Building artifact_with_custom_dist... +Uploading mydist/my_test_code-0.0.1-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt b/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt new file mode 100644 index 00000000000..9894e5b89ff --- /dev/null +++ b/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt @@ -0,0 +1,7 @@ + +>>> errcode [CLI] bundle deploy +Building artifact_with_custom_dist... +Uploading mydist/my_test_code-0.0.1-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/build_and_files_whl/output.txt b/acceptance/bundle/artifacts/build_and_files_whl/output.txt index b618de6b89a..d44a21b582a 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/output.txt +++ b/acceptance/bundle/artifacts/build_and_files_whl/output.txt @@ -7,10 +7,3 @@ Workspace: Path: /Workspace/Users/[USERNAME]/.bundle/test-bundle/default Validation OK! - ->>> errcode [CLI] bundle deploy -Building artifact_with_custom_dist... -Uploading mydist/my_test_code-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/build_and_files_whl/script b/acceptance/bundle/artifacts/build_and_files_whl/script index 2d7d63f7fec..9aa0d870e7a 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/script +++ b/acceptance/bundle/artifacts/build_and_files_whl/script @@ -1,5 +1,5 @@ cp -r $TESTDIR/../whl_explicit/my_test_code/{setup.py,src} . trace $CLI bundle validate # I expect this deploy to work because I explicitly told where to find the wheel, but it does not: -trace errcode $CLI bundle deploy +trace errcode $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 rm mydist/my_test_code-0.0.1-py3-none-any.whl setup.py src/*.py diff --git a/acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt new file mode 100644 index 00000000000..f311959abdd --- /dev/null +++ b/acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt new file mode 100644 index 00000000000..fa5d7b76bcd --- /dev/null +++ b/acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/bash/output.txt b/acceptance/bundle/artifacts/shell/bash/output.txt index fa5d7b76bcd..8b137891791 100644 --- a/acceptance/bundle/artifacts/shell/bash/output.txt +++ b/acceptance/bundle/artifacts/shell/bash/output.txt @@ -1,6 +1 @@ ->>> [CLI] bundle deploy -Building my_artifact... 
-Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/bash/script b/acceptance/bundle/artifacts/shell/bash/script index 68ebb78d775..09bb41643ca 100644 --- a/acceptance/bundle/artifacts/shell/bash/script +++ b/acceptance/bundle/artifacts/shell/bash/script @@ -1 +1 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt new file mode 100644 index 00000000000..3a4ff9138ba --- /dev/null +++ b/acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-basic/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt new file mode 100644 index 00000000000..b5e01c79e67 --- /dev/null +++ b/acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-basic/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/basic/output.txt b/acceptance/bundle/artifacts/shell/basic/output.txt index b5e01c79e67..8b137891791 100644 --- a/acceptance/bundle/artifacts/shell/basic/output.txt +++ b/acceptance/bundle/artifacts/shell/basic/output.txt @@ -1,6 +1 @@ ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-basic/default/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/basic/script b/acceptance/bundle/artifacts/shell/basic/script index 68ebb78d775..09bb41643ca 100644 --- a/acceptance/bundle/artifacts/shell/basic/script +++ b/acceptance/bundle/artifacts/shell/basic/script @@ -1 +1 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt new file mode 100644 index 00000000000..f311959abdd --- /dev/null +++ b/acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt new file mode 100644 index 00000000000..fa5d7b76bcd --- /dev/null +++ b/acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/default/output.txt b/acceptance/bundle/artifacts/shell/default/output.txt index fa5d7b76bcd..8b137891791 100644 --- a/acceptance/bundle/artifacts/shell/default/output.txt +++ b/acceptance/bundle/artifacts/shell/default/output.txt @@ -1,6 +1 @@ ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/default/script b/acceptance/bundle/artifacts/shell/default/script index 68ebb78d775..09bb41643ca 100644 --- a/acceptance/bundle/artifacts/shell/default/script +++ b/acceptance/bundle/artifacts/shell/default/script @@ -1 +1 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt new file mode 100644 index 00000000000..98820986f53 --- /dev/null +++ b/acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-sh/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt new file mode 100644 index 00000000000..5117e6e9fc0 --- /dev/null +++ b/acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-sh/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/sh/output.txt b/acceptance/bundle/artifacts/shell/sh/output.txt index 5117e6e9fc0..8b137891791 100644 --- a/acceptance/bundle/artifacts/shell/sh/output.txt +++ b/acceptance/bundle/artifacts/shell/sh/output.txt @@ -1,6 +1 @@ ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-sh/default/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/sh/script b/acceptance/bundle/artifacts/shell/sh/script index 68ebb78d775..09bb41643ca 100644 --- a/acceptance/bundle/artifacts/shell/sh/script +++ b/acceptance/bundle/artifacts/shell/sh/script @@ -1 +1 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt b/acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt new file mode 100644 index 00000000000..81dddfcb9fc --- /dev/null +++ b/acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt b/acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt new file mode 100644 index 00000000000..494f76c84fa --- /dev/null +++ b/acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/deploy/empty-bundle/output.txt b/acceptance/bundle/deploy/empty-bundle/output.txt index 919accb661f..8498653a6e7 100644 --- a/acceptance/bundle/deploy/empty-bundle/output.txt +++ b/acceptance/bundle/deploy/empty-bundle/output.txt @@ -1,9 +1,4 @@ ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... -Deploying resources... -Deployment complete! 
- >>> [CLI] bundle destroy --auto-approve All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default diff --git a/acceptance/bundle/deploy/empty-bundle/script b/acceptance/bundle/deploy/empty-bundle/script index 775ccd0defc..b74818f1b1a 100644 --- a/acceptance/bundle/deploy/empty-bundle/script +++ b/acceptance/bundle/deploy/empty-bundle/script @@ -4,4 +4,4 @@ cleanup() { } trap cleanup EXIT -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml index 9c9ab5a30bd..6245c198409 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml @@ -11,4 +11,3 @@ Response.Body = '{"job_id": 1111, "settings": {"name": "valid-job"}}' [[Server]] Pattern = "GET /api/2.2/jobs/get?job_id=2222" Response.Body = '{"job_id": 2222, "settings": {"name": "another-valid"}}' - diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/test.toml b/acceptance/bundle/deploy/wal/multiple-crashes/test.toml index c5981d67208..474177b8046 100644 --- a/acceptance/bundle/deploy/wal/multiple-crashes/test.toml +++ b/acceptance/bundle/deploy/wal/multiple-crashes/test.toml @@ -16,4 +16,3 @@ Response.Body = '{}' Pattern = "GET /api/2.2/jobs/get" KillCaller = 1 Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' - diff --git a/acceptance/bundle/scripts/out.deploy.direct.txt b/acceptance/bundle/scripts/out.deploy.direct.txt new file mode 100644 index 00000000000..037f609f944 --- /dev/null +++ b/acceptance/bundle/scripts/out.deploy.direct.txt @@ -0,0 +1,24 @@ + +>>> EXITCODE=0 errcode [CLI] bundle deploy +Executing 'preinit' script +from myscript.py 0 preinit: hello stdout! +from myscript.py 0 preinit: hello stderr! 
+Executing 'postinit' script +from myscript.py 0 postinit: hello stdout! +from myscript.py 0 postinit: hello stderr! +Executing 'prebuild' script +from myscript.py 0 prebuild: hello stdout! +from myscript.py 0 prebuild: hello stderr! +Executing 'postbuild' script +from myscript.py 0 postbuild: hello stdout! +from myscript.py 0 postbuild: hello stderr! +Executing 'predeploy' script +from myscript.py 0 predeploy: hello stdout! +from myscript.py 0 predeploy: hello stderr! +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! +Executing 'postdeploy' script +from myscript.py 0 postdeploy: hello stdout! +from myscript.py 0 postdeploy: hello stderr! diff --git a/acceptance/bundle/scripts/out.deploy.terraform.txt b/acceptance/bundle/scripts/out.deploy.terraform.txt new file mode 100644 index 00000000000..a3d9ba342c2 --- /dev/null +++ b/acceptance/bundle/scripts/out.deploy.terraform.txt @@ -0,0 +1,23 @@ + +>>> EXITCODE=0 errcode [CLI] bundle deploy +Executing 'preinit' script +from myscript.py 0 preinit: hello stdout! +from myscript.py 0 preinit: hello stderr! +Executing 'postinit' script +from myscript.py 0 postinit: hello stdout! +from myscript.py 0 postinit: hello stderr! +Executing 'prebuild' script +from myscript.py 0 prebuild: hello stdout! +from myscript.py 0 prebuild: hello stderr! +Executing 'postbuild' script +from myscript.py 0 postbuild: hello stdout! +from myscript.py 0 postbuild: hello stderr! +Executing 'predeploy' script +from myscript.py 0 predeploy: hello stdout! +from myscript.py 0 predeploy: hello stderr! +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts/default/files... +Deploying resources... +Deployment complete! +Executing 'postdeploy' script +from myscript.py 0 postdeploy: hello stdout! +from myscript.py 0 postdeploy: hello stderr! 
diff --git a/acceptance/bundle/scripts/output.txt b/acceptance/bundle/scripts/output.txt index 68afb2feccb..a39a0b0aa9b 100644 --- a/acceptance/bundle/scripts/output.txt +++ b/acceptance/bundle/scripts/output.txt @@ -25,26 +25,3 @@ Name: scripts Found 1 error Exit code: 1 - ->>> EXITCODE=0 errcode [CLI] bundle deploy -Executing 'preinit' script -from myscript.py 0 preinit: hello stdout! -from myscript.py 0 preinit: hello stderr! -Executing 'postinit' script -from myscript.py 0 postinit: hello stdout! -from myscript.py 0 postinit: hello stderr! -Executing 'prebuild' script -from myscript.py 0 prebuild: hello stdout! -from myscript.py 0 prebuild: hello stderr! -Executing 'postbuild' script -from myscript.py 0 postbuild: hello stdout! -from myscript.py 0 postbuild: hello stderr! -Executing 'predeploy' script -from myscript.py 0 predeploy: hello stdout! -from myscript.py 0 predeploy: hello stderr! -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts/default/files... -Deploying resources... -Deployment complete! -Executing 'postdeploy' script -from myscript.py 0 postdeploy: hello stdout! -from myscript.py 0 postdeploy: hello stderr! diff --git a/acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt b/acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt new file mode 100644 index 00000000000..d8fed9e4e6c --- /dev/null +++ b/acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt @@ -0,0 +1,18 @@ + +>>> errcode [CLI] bundle deploy +Executing 'preinit' script +preinit value_from_env +Executing 'postinit' script +postinit value_from_env +Executing 'prebuild' script +prebuild value_from_env +Executing 'postbuild' script +postbuild value_from_env +Executing 'predeploy' script +predeploy value_from_env +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts_with_restricted_execution/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
+Executing 'postdeploy' script +postdeploy value_from_env diff --git a/acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt b/acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt new file mode 100644 index 00000000000..efcf1281cb7 --- /dev/null +++ b/acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt @@ -0,0 +1,17 @@ + +>>> errcode [CLI] bundle deploy +Executing 'preinit' script +preinit value_from_env +Executing 'postinit' script +postinit value_from_env +Executing 'prebuild' script +prebuild value_from_env +Executing 'postbuild' script +postbuild value_from_env +Executing 'predeploy' script +predeploy value_from_env +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts_with_restricted_execution/default/files... +Deploying resources... +Deployment complete! +Executing 'postdeploy' script +postdeploy value_from_env diff --git a/acceptance/bundle/scripts/restricted-execution/output.txt b/acceptance/bundle/scripts/restricted-execution/output.txt index f377edba7cb..2186ac68f02 100644 --- a/acceptance/bundle/scripts/restricted-execution/output.txt +++ b/acceptance/bundle/scripts/restricted-execution/output.txt @@ -1,22 +1,5 @@ === Without DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION, all envs are accessible ->>> errcode [CLI] bundle deploy -Executing 'preinit' script -preinit value_from_env -Executing 'postinit' script -postinit value_from_env -Executing 'prebuild' script -prebuild value_from_env -Executing 'postbuild' script -postbuild value_from_env -Executing 'predeploy' script -predeploy value_from_env -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts_with_restricted_execution/default/files... -Deploying resources... -Deployment complete! 
-Executing 'postdeploy' script -postdeploy value_from_env - === With DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1, no envs are accessible >>> DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1 errcode [CLI] bundle deploy Error: failed to execute script: running scripts is not allowed when DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION is set diff --git a/acceptance/bundle/scripts/restricted-execution/script b/acceptance/bundle/scripts/restricted-execution/script index 7a3dcb068b4..2e31cce2eea 100644 --- a/acceptance/bundle/scripts/restricted-execution/script +++ b/acceptance/bundle/scripts/restricted-execution/script @@ -1,7 +1,7 @@ export SOME_ENV_VAR="value_from_env" title "Without DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION, all envs are accessible" -trace errcode $CLI bundle deploy +trace errcode $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 title "With DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1, no envs are accessible" trace DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1 errcode $CLI bundle deploy diff --git a/acceptance/bundle/scripts/script b/acceptance/bundle/scripts/script index de07d277ea9..3acb85f9cd1 100644 --- a/acceptance/bundle/scripts/script +++ b/acceptance/bundle/scripts/script @@ -1,3 +1,3 @@ trace EXITCODE=0 errcode $CLI bundle validate trace EXITCODE=1 errcode $CLI bundle validate -trace EXITCODE=0 errcode $CLI bundle deploy +trace EXITCODE=0 errcode $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt new file mode 100644 index 00000000000..0e133547de1 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy -t one +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Updating deployment state... 
+Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt new file mode 100644 index 00000000000..65960fa86d5 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy -t one +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt new file mode 100644 index 00000000000..120e5902015 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy -t two +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt new file mode 100644 index 00000000000..fabdebb399f --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy -t two +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt index a03920c3fdc..69c6730b46a 100644 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt @@ -1,14 +1,4 @@ ->>> [CLI] bundle deploy -t one -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Deployment complete! - ->>> [CLI] bundle deploy -t two -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... -Deploying resources... -Deployment complete! - >>> cat out.requests.txt { "workspace_artifact_path_type": "WORKSPACE_FILE_SYSTEM" diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/script b/acceptance/bundle/telemetry/deploy-artifact-path-type/script index d1a63928a67..4f3bd7c3cf4 100644 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/script +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/script @@ -1,6 +1,6 @@ -trace $CLI bundle deploy -t one +trace $CLI bundle deploy -t one > out.deploy-one.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 -trace $CLI bundle deploy -t two +trace $CLI bundle deploy -t two > out.deploy-two.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {workspace_artifact_path_type}' diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt b/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt new file mode 100644 index 00000000000..1b73d1b9169 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... 
+Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt b/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt new file mode 100644 index 00000000000..5c6aad5b37b --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/output.txt b/acceptance/bundle/telemetry/deploy-config-file-count/output.txt index 909e8d6c705..1637965310c 100644 --- a/acceptance/bundle/telemetry/deploy-config-file-count/output.txt +++ b/acceptance/bundle/telemetry/deploy-config-file-count/output.txt @@ -1,9 +1,4 @@ ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! 
- >>> cat out.requests.txt { "configuration_file_count": 4 diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/script b/acceptance/bundle/telemetry/deploy-config-file-count/script index c495bdcb071..7fbdd0e6776 100644 --- a/acceptance/bundle/telemetry/deploy-config-file-count/script +++ b/acceptance/bundle/telemetry/deploy-config-file-count/script @@ -1,4 +1,4 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {configuration_file_count}' diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt new file mode 100644 index 00000000000..e86795abf5d --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy -t dev +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/dev/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt new file mode 100644 index 00000000000..ee47fabbb63 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy -t dev +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/dev/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt new file mode 100644 index 00000000000..5957e33b910 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt @@ -0,0 +1,12 @@ + +>>> [CLI] bundle deploy -t prod +Recommendation: target with 'mode: production' should set 'workspace.root_path' to make sure only one copy is deployed + +A common practice is to use a username or principal name in this path, i.e. use + + root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/prod/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt new file mode 100644 index 00000000000..ac2e13efb95 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt @@ -0,0 +1,11 @@ + +>>> [CLI] bundle deploy -t prod +Recommendation: target with 'mode: production' should set 'workspace.root_path' to make sure only one copy is deployed + +A common practice is to use a username or principal name in this path, i.e. use + + root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/prod/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/output.txt b/acceptance/bundle/telemetry/deploy-mode/output.txt index 99e7fbb699a..89be65f1950 100644 --- a/acceptance/bundle/telemetry/deploy-mode/output.txt +++ b/acceptance/bundle/telemetry/deploy-mode/output.txt @@ -1,20 +1,4 @@ ->>> [CLI] bundle deploy -t dev -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/dev/files... -Deploying resources... 
-Deployment complete! - ->>> [CLI] bundle deploy -t prod -Recommendation: target with 'mode: production' should set 'workspace.root_path' to make sure only one copy is deployed - -A common practice is to use a username or principal name in this path, i.e. use - - root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} - -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/prod/files... -Deploying resources... -Deployment complete! - >>> cat out.requests.txt { "bundle_mode": "DEVELOPMENT" diff --git a/acceptance/bundle/telemetry/deploy-mode/script b/acceptance/bundle/telemetry/deploy-mode/script index f7257769ac1..0a9d57a1a43 100644 --- a/acceptance/bundle/telemetry/deploy-mode/script +++ b/acceptance/bundle/telemetry/deploy-mode/script @@ -1,6 +1,6 @@ -trace $CLI bundle deploy -t dev +trace $CLI bundle deploy -t dev > out.deploy-dev.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 -trace $CLI bundle deploy -t prod +trace $CLI bundle deploy -t prod > out.deploy-prod.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {bundle_mode}' diff --git a/acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt b/acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt new file mode 100644 index 00000000000..0e133547de1 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy -t one +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt b/acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt new file mode 100644 index 00000000000..65960fa86d5 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy -t one +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-target-count/output.txt b/acceptance/bundle/telemetry/deploy-target-count/output.txt index 31581169f2c..9c59c430234 100644 --- a/acceptance/bundle/telemetry/deploy-target-count/output.txt +++ b/acceptance/bundle/telemetry/deploy-target-count/output.txt @@ -1,9 +1,4 @@ ->>> [CLI] bundle deploy -t one -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Deployment complete! - >>> cat out.requests.txt { "target_count": 3 diff --git a/acceptance/bundle/telemetry/deploy-target-count/script b/acceptance/bundle/telemetry/deploy-target-count/script index 3022a2b5e49..6e9d2f7378c 100644 --- a/acceptance/bundle/telemetry/deploy-target-count/script +++ b/acceptance/bundle/telemetry/deploy-target-count/script @@ -1,4 +1,4 @@ -trace $CLI bundle deploy -t one +trace $CLI bundle deploy -t one > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {target_count}' diff --git a/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt b/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt new file mode 100644 index 00000000000..1b73d1b9169 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Uploading bundle files 
to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt b/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt new file mode 100644 index 00000000000..5c6aad5b37b --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-variable-count/output.txt b/acceptance/bundle/telemetry/deploy-variable-count/output.txt index be4840e69ef..e8580d71b39 100644 --- a/acceptance/bundle/telemetry/deploy-variable-count/output.txt +++ b/acceptance/bundle/telemetry/deploy-variable-count/output.txt @@ -1,9 +1,4 @@ ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! 
- >>> cat out.requests.txt { "variable_count": 6, diff --git a/acceptance/bundle/telemetry/deploy-variable-count/script b/acceptance/bundle/telemetry/deploy-variable-count/script index dad762899a2..caaf8c1f39f 100644 --- a/acceptance/bundle/telemetry/deploy-variable-count/script +++ b/acceptance/bundle/telemetry/deploy-variable-count/script @@ -1,4 +1,4 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs.[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {variable_count, lookup_variable_count, complex_variable_count}' diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt new file mode 100644 index 00000000000..f8db617c003 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt @@ -0,0 +1,8 @@ + +>>> [CLI] bundle deploy -t one +Building test... +Uploading my_test_code/dist/my_test_code-0.0.1-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt new file mode 100644 index 00000000000..048d0f07b50 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy -t one +Building test... +Uploading my_test_code/dist/my_test_code-0.0.1-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt new file mode 100644 index 00000000000..b786de11fed --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt @@ -0,0 +1,8 @@ + +>>> [CLI] bundle deploy -t two +Building test... +Uploading .databricks/bundle/two/patched_wheels/test_my_test_code/my_test_code-0.0.1+[UNIX_TIME_NANOS]-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt new file mode 100644 index 00000000000..651d315f77c --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy -t two +Building test... +Uploading .databricks/bundle/two/patched_wheels/test_my_test_code/my_test_code-0.0.1+[UNIX_TIME_NANOS]-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt index a9b8ce4ae6e..ed89628d989 100644 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt @@ -1,18 +1,4 @@ ->>> [CLI] bundle deploy -t one -Building test... -Uploading my_test_code/dist/my_test_code-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Deployment complete! - ->>> [CLI] bundle deploy -t two -Building test... 
-Uploading .databricks/bundle/two/patched_wheels/test_my_test_code/my_test_code-0.0.1+[UNIX_TIME_NANOS]-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... -Deploying resources... -Deployment complete! - >>> cat out.requests.txt { "bool_values": [ diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/script b/acceptance/bundle/telemetry/deploy-whl-artifacts/script index 078fa94cdd3..5bc513afb87 100644 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/script +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/script @@ -2,9 +2,9 @@ uv venv -q .venv venv_activate uv pip install -q --no-index setuptools -trace $CLI bundle deploy -t one +trace $CLI bundle deploy -t one > out.deploy-one.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 -trace $CLI bundle deploy -t two +trace $CLI bundle deploy -t two > out.deploy-two.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {bool_values}' diff --git a/acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt b/acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt new file mode 100644 index 00000000000..1b73d1b9169 --- /dev/null +++ b/acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
diff --git a/acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt b/acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt new file mode 100644 index 00000000000..5c6aad5b37b --- /dev/null +++ b/acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/validate/sync_patterns/output.txt b/acceptance/bundle/validate/sync_patterns/output.txt index b35859d86a9..0c061fbe312 100644 --- a/acceptance/bundle/validate/sync_patterns/output.txt +++ b/acceptance/bundle/validate/sync_patterns/output.txt @@ -20,8 +20,3 @@ Validation OK! "." ] } - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/validate/sync_patterns/script b/acceptance/bundle/validate/sync_patterns/script index d2aae85444a..485556d28a6 100644 --- a/acceptance/bundle/validate/sync_patterns/script +++ b/acceptance/bundle/validate/sync_patterns/script @@ -1,5 +1,5 @@ trace $CLI bundle validate trace $CLI bundle validate -o json | jq '.sync' -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 jq 'select(.path | test("dir/test.yml"))' out.requests.txt > out.sync.txt rm out.requests.txt diff --git a/acceptance/cache/simple/out.deploy.direct.txt b/acceptance/cache/simple/out.deploy.direct.txt new file mode 100644 index 00000000000..945da6d1443 --- /dev/null +++ b/acceptance/cache/simple/out.deploy.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy -p dogfood +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/exploratory-cache-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
diff --git a/acceptance/cache/simple/out.deploy.terraform.txt b/acceptance/cache/simple/out.deploy.terraform.txt new file mode 100644 index 00000000000..41cfbc2a2d3 --- /dev/null +++ b/acceptance/cache/simple/out.deploy.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy -p dogfood +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/exploratory-cache-test/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/cache/simple/script b/acceptance/cache/simple/script index a2907174bf3..524c077f460 100644 --- a/acceptance/cache/simple/script +++ b/acceptance/cache/simple/script @@ -9,7 +9,7 @@ title "Second call in a session is expected to be a cache hit\n" trace $CLI bundle validate -p dogfood --debug 2>&1 | grep "Local Cache" | grep -v "cache path" title "Bundle deploy should send telemetry values\n" -trace $CLI bundle deploy -p dogfood +trace $CLI bundle deploy -p dogfood > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace print_telemetry_bool_values | grep "local.cache" rm out.requests.txt diff --git a/acceptance/cache/simple/test.toml b/acceptance/cache/simple/test.toml index f791f9a03c0..2601c79f825 100644 --- a/acceptance/cache/simple/test.toml +++ b/acceptance/cache/simple/test.toml @@ -3,6 +3,9 @@ Local = true RecordRequests = true +# Enable engine-specific output files +EnvVaryOutput = "DATABRICKS_BUNDLE_ENGINE" + # Redact structured logging fields from debug output [[Repls]] Old = ' pid=[0-9]+' From 2029281974bdc03db27f368bc96fc323bc13858b Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Fri, 27 Mar 2026 21:43:22 +0100 Subject: [PATCH 13/80] Merge simplified WAL handling into state.go fix Open() calls; replace Finalize() with Close(); close state file in plan --- bundle/direct/bind.go | 12 +- bundle/direct/bundle_apply.go | 1 - bundle/direct/bundle_plan.go | 2 +- bundle/direct/dstate/state.go | 243 ++++++++++++++++++++-------- bundle/direct/dstate/wal.go | 289 ---------------------------------- 
cmd/bundle/utils/process.go | 2 +- 6 files changed, 189 insertions(+), 360 deletions(-) delete mode 100644 bundle/direct/dstate/wal.go diff --git a/bundle/direct/bind.go b/bundle/direct/bind.go index 7f11a8674d5..74389313af4 100644 --- a/bundle/direct/bind.go +++ b/bundle/direct/bind.go @@ -62,7 +62,7 @@ type BindResult struct { func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.WorkspaceClient, configRoot *config.Root, statePath, resourceKey, resourceID string) (*BindResult, error) { // Check if the resource is already managed (bound to a different ID) var checkStateDB dstate.DeploymentState - if err := checkStateDB.Open(ctx, statePath); err == nil { + if err := checkStateDB.Open(ctx, statePath, dstate.WithRecovery(true), dstate.WithWrite(true)); err == nil { if existingID := checkStateDB.GetResourceID(resourceKey); existingID != "" { return nil, ErrResourceAlreadyBound{ ResourceKey: resourceKey, @@ -82,7 +82,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac } // Open temp state - err := b.StateDB.Open(ctx, tmpStatePath) + err := b.StateDB.Open(ctx, tmpStatePath, dstate.WithRecovery(true), dstate.WithWrite(true)) if err != nil { os.Remove(tmpStatePath) return nil, err @@ -96,7 +96,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac } // Finalize to persist temp state to disk - err = b.StateDB.Finalize(ctx) + err = b.StateDB.Close(ctx) if err != nil { os.Remove(tmpStatePath) return nil, err @@ -138,7 +138,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac return nil, err } - err = b.StateDB.Finalize(ctx) + err = b.StateDB.Close(ctx) if err != nil { os.Remove(tmpStatePath) return nil, err @@ -188,7 +188,7 @@ func (result *BindResult) Cancel() { // Unbind removes a resource from direct engine state without deleting // the workspace resource. Also removes associated permissions/grants entries. 
func (b *DeploymentBundle) Unbind(ctx context.Context, statePath, resourceKey string) error { - err := b.StateDB.Open(ctx, statePath) + err := b.StateDB.Open(ctx, statePath, dstate.WithRecovery(true), dstate.WithWrite(true)) if err != nil { return err } @@ -216,5 +216,5 @@ func (b *DeploymentBundle) Unbind(ctx context.Context, statePath, resourceKey st } } - return b.StateDB.Finalize(ctx) + return b.StateDB.Close(ctx) } diff --git a/bundle/direct/bundle_apply.go b/bundle/direct/bundle_apply.go index a7f3ee65fc2..7a77968515f 100644 --- a/bundle/direct/bundle_apply.go +++ b/bundle/direct/bundle_apply.go @@ -25,7 +25,6 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa return } - b.StateDB.AssertOpened() b.RemoteStateCache.Clear() g, err := makeGraph(plan) diff --git a/bundle/direct/bundle_plan.go b/bundle/direct/bundle_plan.go index 1fb70123b96..03fe2b87bb2 100644 --- a/bundle/direct/bundle_plan.go +++ b/bundle/direct/bundle_plan.go @@ -40,7 +40,7 @@ func (b *DeploymentBundle) init(client *databricks.WorkspaceClient) error { // ValidatePlanAgainstState validates that a plan's lineage and serial match the current state. // This should be called early in the deployment process, before any file operations. // If the plan has no lineage (first deployment), validation is skipped. 
-func ValidatePlanAgainstState(ctx context.Context, stateDB *dstate.DeploymentState, plan *deployplan.Plan) error { +func OpenStateWithPlanCheck(ctx context.Context, stateDB *dstate.DeploymentState, plan *deployplan.Plan) error { // If plan has no lineage, this is a first deployment before any state exists // No validation needed if plan.Lineage == "" { diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index cfa7ec21143..e409c3f6e86 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -1,6 +1,7 @@ package dstate import ( + "bufio" "context" "encoding/json" "fmt" @@ -16,13 +17,19 @@ import ( "github.com/google/uuid" ) -const currentStateVersion = 2 +const ( + currentStateVersion = 2 + initialBufferSize = 64 * 1024 + maxWalEntrySize = 1024 * 1024 + walSuffix = ".WAL" +) type DeploymentState struct { - Path string - Data Database - mu sync.Mutex - walFile *os.File + Path string + Data Database + mu sync.Mutex + walFile *os.File + stateIDs map[string]string } type Database struct { @@ -47,8 +54,8 @@ type WALHeader struct { } type WALEntry struct { - K string `json:"k"` - V *ResourceEntry `json:"v,omitempty"` // nil means delete + Key string `json:"k"` + Value *ResourceEntry `json:"v,omitempty"` // nil means delete } func NewDatabase(lineage string, serial int) Database { @@ -70,6 +77,7 @@ func (db *DeploymentState) SaveState(key, newID string, state any, dependsOn []d db.Data.State = make(map[string]ResourceEntry) } + // don't indent so that every WAL entry remains on a single line jsonMessage, err := json.Marshal(state) if err != nil { return err @@ -81,8 +89,11 @@ func (db *DeploymentState) SaveState(key, newID string, state any, dependsOn []d DependsOn: dependsOn, } - db.Data.State[key] = entry - return appendJSONLine(db.walFile, WALEntry{K: key, V: &entry}) + err = appendJSONLine(db.walFile, WALEntry{Key: key, Value: &entry}) + if err == nil { + db.stateIDs[key] = newID + } + return err } func (db 
*DeploymentState) DeleteState(key string) error { @@ -94,12 +105,15 @@ func (db *DeploymentState) DeleteState(key string) error { return nil } - delete(db.Data.State, key) - return appendJSONLine(db.walFile, WALEntry{K: key}) + err := appendJSONLine(db.walFile, WALEntry{Key: key}) + if err == nil { + delete(db.stateIDs, key) + } + return err } -func (db *DeploymentState) getResourceEntry(key string) (ResourceEntry, bool) { - db.AssertOpened() +func (db *DeploymentState) GetResourceEntry(key string) (ResourceEntry, bool) { + db.AssertOpenedForRead() db.mu.Lock() defer db.mu.Unlock() @@ -111,14 +125,28 @@ func (db *DeploymentState) getResourceEntry(key string) (ResourceEntry, bool) { return result, ok } -// GetResourceEntry returns the full resource entry for the given key. -func (db *DeploymentState) GetResourceEntry(key string) (ResourceEntry, bool) { - return db.getResourceEntry(key) -} - // GetResourceID returns the ID of the resource for the given key, or an empty string if not found. 
func (db *DeploymentState) GetResourceID(key string) string { - entry, _ := db.getResourceEntry(key) + db.AssertOpenedForReadOrWrite() + db.mu.Lock() + defer db.mu.Unlock() + + if db.walFile != nil { + // in write-mode new IDs are written to WAL and stored in this map + id := db.stateIDs[key] + if id != "" { + return id + } + } + + // in read mode State is the source of IDs for all requests + // in write mode State is the source of IDs for all resources that were not updated + + if db.Data.State == nil { + return "" + } + + entry, _ := db.Data.State[key] return entry.ID } @@ -135,33 +163,19 @@ func (db *DeploymentState) Open(ctx context.Context, path string, withRecovery W panic(fmt.Sprintf("state already opened: %v, cannot open %v", db.Path, path)) } - data, err := os.ReadFile(path) - if err != nil { - if errors.Is(err, fs.ErrNotExist) { - // Not initializing lineage yet, we might have that saved in WAL - db.Data = NewDatabase("", 0) - db.Path = path - } else { - return err - } - } else { - err = json.Unmarshal(data, &db.Data) - if err != nil { - return err - } - db.Path = path - } + db.Path = path + db.Reload(ctx) - walPath := walPath(db.Path) + walPath := db.Path + walSuffix _, walError := os.Stat(walPath) if walError == nil { if withRecovery { - err := db.mergeWalIntoState(ctx) + err := db.replayWAL(ctx) if err != nil { return err } } else { - return fmt.Errorf("unprocessed WAL exists: %s", walPath) + return fmt.Errorf("Unexpected WAL file found at %s", walPath) } } @@ -170,66 +184,171 @@ func (db *DeploymentState) Open(ctx context.Context, path string, withRecovery W } if withWrite { - db.walFile, err = os.OpenFile(walPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600) + walFile, err := os.OpenFile(walPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600) if err != nil { return fmt.Errorf("failed to open WAL file %s: %w", walPath, err) } + db.walFile = walFile lineage := db.Data.Lineage if lineage == "" { + // state file is new, does not have lineage yet; store lineage 
in the WAL only lineage = uuid.New().String() } - // Set our Serial to the next one - db.Data.Serial += 1 walHead := WALHeader{ Lineage: lineage, - Serial: db.Data.Serial, // next serial + Serial: db.Data.Serial + 1, StateVersion: currentStateVersion, CLIVersion: build.GetInfo().Version, } - err := appendJSONLine(db.walFile, walHead) - if err != nil { + return appendJSONLine(db.walFile, walHead) + } + + return nil +} + +func (db *DeploymentState) Reload(ctx context.Context) error { + + + data, err := os.ReadFile(db.Path) + if err != nil { + if errors.Is(err, fs.ErrNotExist) { + // Not initializing lineage yet, we might have that saved in WAL + db.Data = NewDatabase("", 0) + } else { return err } + } else { + return json.Unmarshal(data, &db.Data) } + return nil +} +func (db *DeploymentState) replayWAL(ctx context.Context) error { + walPath := db.Path + walSuffix + hasUpdates, err := db.mergeWalIntoState(ctx) + if err != nil { + return fmt.Errorf("failed to apply WAL file %s: %w", walPath, err) + } + if hasUpdates { + if err := db.unlockedSave(); err != nil { + return err + } + } + err = os.Remove(walPath) + if err != nil { + return fmt.Errorf("failed to remove WAL file %s: %w", walPath, err) + } return nil } -func (db *DeploymentState) mergeWalIntoState(ctx context.Context) error { +func (db *DeploymentState) validateWALHeader(ctx context.Context, header *WALHeader) error { + if header.CLIVersion != db.Data.CLIVersion { + return fmt.Errorf("cli_version in the header (%q) does not match the one in the state (%q)", header.CLIVersion, db.Data.CLIVersion) + } + + if header.StateVersion != db.Data.StateVersion { + return fmt.Errorf("state_version in the header (%q) does not match the one in the state (%q)", header.StateVersion, db.Data.StateVersion) + } + + if header.Lineage != db.Data.Lineage { + return fmt.Errorf("lineage in the header (%q) does not match the one in the state (%q)", header.Lineage, db.Data.Lineage) + } + + if header.Serial != db.Data.Serial+1 { + 
return fmt.Errorf("serial in the header (%q) is not one higher than the one in the state (%q)", header.Serial, db.Data.Serial) + } + + return nil } -func (db *DeploymentState) Finalize(ctx context.Context) error { - db.AssertOpenedForWrite() - db.mu.Lock() - defer db.mu.Unlock() +func (db *DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) { + if db.walFile != nil { + panic("internal error: walFile must be closed") + } + + hasUpdates := false + walPath := db.Path + walSuffix + walFile, err := os.Open(walPath) + if err != nil { + return false, fmt.Errorf("failed to open WAL file %s: %w", walPath, err) + } + defer walFile.Close() + + scanner := bufio.NewScanner(walFile) + scanner.Buffer(make([]byte, 0, initialBufferSize), maxWalEntrySize) + lineNumber := 0 + + for scanner.Scan() { + lineNumber += 1 + line := scanner.Bytes() + if lineNumber == 1 { + var header WALHeader + if err := json.Unmarshal(line, &header); err != nil { + return hasUpdates, fmt.Errorf("failed to parse WAL header: %w", err) + } + if err := db.validateWALHeader(ctx, &header); err != nil { + return hasUpdates, err + } + } else { + var entry WALEntry + if err := json.Unmarshal(line, &entry); err != nil { + return hasUpdates, fmt.Errorf("failed to parse WAL entry %s:%s: %q: %w", walPath, lineNumber, entry, err) + } + hasUpdates = true + if entry.Value == nil { + delete(db.Data.State, entry.Key) + } else { + db.Data.State[entry.Key] = *entry.Value + } + } + } + + if err := scanner.Err(); err != nil { + return hasUpdates, err + } - db.walFile.Close() - db.walFile = nil - return db.mergeWalIntoState(ctx) + if hasUpdates { + // only assume WAL file's serial if we read any data from it + db.Data.Serial += 1 + } + + return hasUpdates, nil } -// Close closes the WAL file without saving state. 
-func (db *DeploymentState) Close() error { +func (db *DeploymentState) Close(ctx context.Context) error { db.mu.Lock() defer db.mu.Unlock() - if db.wal != nil { - if err := db.wal.close(); err != nil { - return err - } - db.wal = nil + var err error + + if db.walFile != nil { + db.walFile.Close() + db.walFile = nil + err = db.replayWAL(ctx) } - return nil + + db.Path = "" + db.Data = Database{} + db.stateIDs = make(map[string]string) + + return err } -func (db *DeploymentState) AssertOpened() { +func (db *DeploymentState) AssertOpenedForReadOrWrite() { if db.Path == "" { panic("internal error: DeploymentState must be opened first") } } +func (db *DeploymentState) AssertOpenedForRead() { + db.AssertOpenedForReadOrWrite() + if db.walFile != nil { + panic("internal error: DeploymentState must be opened in read mode") + } +} + func (db *DeploymentState) AssertOpenedForWrite() { - db.AssertOpened() + db.AssertOpenedForReadOrWrite() if db.walFile == nil { panic("internal error: DeploymentState must be opened in write mode") } diff --git a/bundle/direct/dstate/wal.go b/bundle/direct/dstate/wal.go deleted file mode 100644 index 9ccb12303d2..00000000000 --- a/bundle/direct/dstate/wal.go +++ /dev/null @@ -1,289 +0,0 @@ -package dstate - -import ( - "bufio" - "bytes" - "context" - "encoding/json" - "errors" - "fmt" - "os" - "path/filepath" - "strings" - - "github.com/databricks/cli/libs/cmdio" - "github.com/databricks/cli/libs/log" -) - -type WAL struct { - file *os.File -} - -type corruptedWALEntry struct { - lineNumber int - rawLine string - parseErr error -} - -type walReplayResult struct { - recovered bool - stale bool - entriesRecovered int - corruptedEntries []corruptedWALEntry -} - -var errWALRead = errors.New("wal read error") - -func walPath(statePath string) string { - return statePath + ".wal" -} - -func walCorruptedPath(statePath string) string { - return walPath(statePath) + ".corrupted" -} - -func openWAL(statePath string) (*WAL, error) { - wp := 
walPath(statePath) - f, err := os.OpenFile(wp, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600) - if err != nil { - return nil, fmt.Errorf("failed to open WAL file %q: %w", wp, err) - } - return &WAL{file: f}, nil -} - -func (w *WAL) writeJSON(v any) error { - data, err := json.Marshal(v) - if err != nil { - return fmt.Errorf("failed to marshal WAL entry: %w", err) - } - data = append(data, '\n') - - _, err = w.file.Write(data) - if err != nil { - return fmt.Errorf("failed to write WAL entry: %w", err) - } - - if err := w.file.Sync(); err != nil { - return fmt.Errorf("failed to sync WAL entry: %w", err) - } - - return nil -} - -func (w *WAL) close() error { - if w.file != nil { - return w.file.Close() - } - return nil -} - -func cleanupWAL(statePath string) error { - err := os.Remove(walPath(statePath)) - if err != nil && !os.IsNotExist(err) { - return fmt.Errorf("failed to remove WAL file %q: %w", walPath(statePath), err) - } - return nil -} - -func moveWALToCorrupted(statePath string) error { - source := walPath(statePath) - target := walCorruptedPath(statePath) - _ = os.Remove(target) - if err := os.Rename(source, target); err != nil { - return fmt.Errorf("failed to move WAL file %q to %q: %w", source, target, err) - } - return nil -} - -func writeCorruptedWALEntries(statePath string, corrupted []corruptedWALEntry) error { - if len(corrupted) == 0 { - return nil - } - - target := walCorruptedPath(statePath) - f, err := os.OpenFile(target, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o600) - if err != nil { - return fmt.Errorf("failed to create corrupted WAL file %q: %w", target, err) - } - defer f.Close() - - for _, entry := range corrupted { - if _, err := f.WriteString(entry.rawLine + "\n"); err != nil { - return fmt.Errorf("failed to write corrupted WAL file %q: %w", target, err) - } - } - - if err := f.Sync(); err != nil { - return fmt.Errorf("failed to sync corrupted WAL file %q: %w", target, err) - } - - return nil -} - -func readWAL(statePath string) (*WALHeader, 
[]WALEntry, []corruptedWALEntry, error) { - wp := walPath(statePath) - f, err := os.Open(wp) - if err != nil { - return nil, nil, nil, err - } - defer f.Close() - - scanner := bufio.NewScanner(f) - scanner.Buffer(make([]byte, 0, 64*1024), 10*1024*1024) - var header *WALHeader - var entries []WALEntry - var corrupted []corruptedWALEntry - lineNumber := 0 - for scanner.Scan() { - lineNumber++ - line := bytes.TrimSpace(scanner.Bytes()) - if len(line) == 0 { - continue - } - - lineCopy := make([]byte, len(line)) - copy(lineCopy, line) - if header == nil { - var h WALHeader - if err := json.Unmarshal(lineCopy, &h); err != nil { - return nil, nil, nil, fmt.Errorf("failed to parse WAL header: %w", err) - } - header = &h - continue - } - - var e WALEntry - if err := json.Unmarshal(lineCopy, &e); err != nil { - corrupted = append(corrupted, corruptedWALEntry{ - lineNumber: lineNumber, - rawLine: string(lineCopy), - parseErr: err, - }) - continue - } - - if e.K == "" { - corrupted = append(corrupted, corruptedWALEntry{ - lineNumber: lineNumber, - rawLine: string(lineCopy), - parseErr: errors.New("entry has empty key"), - }) - continue - } - - entries = append(entries, e) - } - - if err := scanner.Err(); err != nil { - return nil, nil, nil, fmt.Errorf("failed to read WAL file: %w", err) - } - - if header == nil { - return nil, nil, nil, errors.New("WAL file is empty") - } - - return header, entries, corrupted, nil -} - -func replayWAL(statePath string, db *Database) (walReplayResult, error) { - result := walReplayResult{} - header, entries, corrupted, err := readWAL(statePath) - if err != nil { - if os.IsNotExist(err) { - return result, nil - } - return result, fmt.Errorf("%w: %v", errWALRead, err) - } - - expectedSerial := db.Serial + 1 - if header.Serial < expectedSerial { - result.stale = true - return result, nil - } - - if header.Serial > expectedSerial { - return result, fmt.Errorf("WAL serial (%d) is ahead of expected (%d), state may be corrupted", header.Serial, 
expectedSerial) - } - - if db.Lineage != "" && header.Lineage != "" && db.Lineage != header.Lineage { - return result, fmt.Errorf("WAL lineage (%s) does not match state lineage (%s)", header.Lineage, db.Lineage) - } - - if db.Lineage == "" && header.Lineage != "" { - db.Lineage = header.Lineage - } - - if db.State == nil { - db.State = make(map[string]ResourceEntry) - } - - for _, entry := range entries { - if entry.V != nil { - db.State[entry.K] = *entry.V - } else { - delete(db.State, entry.K) - } - } - - result.recovered = true - result.entriesRecovered = len(entries) - result.corruptedEntries = corrupted - return result, nil -} - -func recoverFromWAL(ctx context.Context, statePath string, db *Database) (bool, error) { - replayResult, err := replayWAL(statePath, db) - if err != nil { - if errors.Is(err, errWALRead) { - if moveErr := moveWALToCorrupted(statePath); moveErr != nil { - return false, moveErr - } - log.Warnf(ctx, "Failed to read WAL file, moved it to %s and proceeding: %s", relativePathForLog(walCorruptedPath(statePath)), strings.TrimPrefix(err.Error(), errWALRead.Error()+": ")) - return false, nil - } - return false, err - } - - if replayResult.stale { - log.Warnf(ctx, "Deleting stale WAL (serial=%s behind current state serial=)") - if err := cleanupWAL(statePath); err != nil { - return false, err - } - return false, nil - } - - if !replayResult.recovered { - return false, nil - } - - logRecoveryProgress(ctx, "Recovering state from WAL file: "+relativePathForLog(walPath(statePath))) - walLogPath := relativePathForLog(walPath(statePath)) - for _, corrupted := range replayResult.corruptedEntries { - log.Warnf(ctx, "Could not read state file WAL entry in %s: line %d", walLogPath, corrupted.lineNumber) - } - - if err := writeCorruptedWALEntries(statePath, replayResult.corruptedEntries); err != nil { - return false, err - } - if len(replayResult.corruptedEntries) > 0 { - log.Warnf(ctx, "Saved corrupted WAL entries to %s", 
relativePathForLog(walCorruptedPath(statePath))) - } - - logRecoveryProgress(ctx, fmt.Sprintf("Recovered %d entries from WAL file.", replayResult.entriesRecovered)) - return true, nil -} - -func relativePathForLog(path string) string { - rel, err := filepath.Rel(".", path) - if err != nil { - return path - } - return filepath.ToSlash(rel) -} - -func logRecoveryProgress(ctx context.Context, message string) { - defer func() { - _ = recover() - }() - cmdio.LogString(ctx, message) -} diff --git a/cmd/bundle/utils/process.go b/cmd/bundle/utils/process.go index 75081de56e6..54391ec4d60 100644 --- a/cmd/bundle/utils/process.go +++ b/cmd/bundle/utils/process.go @@ -236,7 +236,7 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle // Validate that the plan's lineage and serial match the current state // This must happen before any file operations - err = direct.ValidatePlanAgainstState(ctx, &b.DeploymentBundle.StateDB, plan) + err = direct.OpenStateWithPlanCheck(ctx, &b.DeploymentBundle.StateDB, plan) if err != nil { logdiag.LogError(ctx, err) return b, stateDesc, root.ErrAlreadyPrinted From f79fa29cb6732862fc77a2cb63020ac4acce9912 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Fri, 27 Mar 2026 22:22:02 +0100 Subject: [PATCH 14/80] fixes --- bundle/direct/dstate/state.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index e409c3f6e86..df237d97cbc 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -209,6 +209,7 @@ func (db *DeploymentState) Open(ctx context.Context, path string, withRecovery W func (db *DeploymentState) Reload(ctx context.Context) error { + db.stateIDs = make(map[string]string) data, err := os.ReadFile(db.Path) if err != nil { if errors.Is(err, fs.ErrNotExist) { @@ -250,7 +251,7 @@ func (db *DeploymentState) validateWALHeader(ctx context.Context, header *WALHea return fmt.Errorf("state_version in the header 
(%q) does not match the one in the state (%q)", header.StateVersion, db.Data.StateVersion) } - if header.Lineage != db.Data.Lineage { + if header.Lineage != db.Data.Lineage && db.Data.Lineage != "" { return fmt.Errorf("lineage in the header (%q) does not match the one in the state (%q)", header.Lineage, db.Data.Lineage) } From bb11b78a2b78d346b59c95bbf2a1adef11dd0a12 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Fri, 27 Mar 2026 22:38:05 +0100 Subject: [PATCH 15/80] fixes --- bundle/direct/bundle_apply.go | 1 + bundle/direct/bundle_plan.go | 2 +- bundle/direct/dstate/state.go | 10 +- bundle/direct/dstate/wal_test.go | 549 ------------------------------- bundle/phases/deploy.go | 9 + cmd/bundle/utils/process.go | 2 +- 6 files changed, 18 insertions(+), 555 deletions(-) delete mode 100644 bundle/direct/dstate/wal_test.go diff --git a/bundle/direct/bundle_apply.go b/bundle/direct/bundle_apply.go index 7a77968515f..6bad8091469 100644 --- a/bundle/direct/bundle_apply.go +++ b/bundle/direct/bundle_apply.go @@ -25,6 +25,7 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa return } + b.StateDB.AssertOpenedForWrite() b.RemoteStateCache.Clear() g, err := makeGraph(plan) diff --git a/bundle/direct/bundle_plan.go b/bundle/direct/bundle_plan.go index 03fe2b87bb2..1fb70123b96 100644 --- a/bundle/direct/bundle_plan.go +++ b/bundle/direct/bundle_plan.go @@ -40,7 +40,7 @@ func (b *DeploymentBundle) init(client *databricks.WorkspaceClient) error { // ValidatePlanAgainstState validates that a plan's lineage and serial match the current state. // This should be called early in the deployment process, before any file operations. // If the plan has no lineage (first deployment), validation is skipped. 
-func OpenStateWithPlanCheck(ctx context.Context, stateDB *dstate.DeploymentState, plan *deployplan.Plan) error { +func ValidatePlanAgainstState(ctx context.Context, stateDB *dstate.DeploymentState, plan *deployplan.Plan) error { // If plan has no lineage, this is a first deployment before any state exists // No validation needed if plan.Lineage == "" { diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index df237d97cbc..1e882284276 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -146,7 +146,7 @@ func (db *DeploymentState) GetResourceID(key string) string { return "" } - entry, _ := db.Data.State[key] + entry := db.Data.State[key] return entry.ID } @@ -164,7 +164,9 @@ func (db *DeploymentState) Open(ctx context.Context, path string, withRecovery W } db.Path = path - db.Reload(ctx) + if err := db.Reload(ctx); err != nil { + return err + } walPath := db.Path + walSuffix _, walError := os.Stat(walPath) @@ -175,7 +177,7 @@ func (db *DeploymentState) Open(ctx context.Context, path string, withRecovery W return err } } else { - return fmt.Errorf("Unexpected WAL file found at %s", walPath) + return fmt.Errorf("unexpected WAL file found at %s", walPath) } } @@ -293,7 +295,7 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) } else { var entry WALEntry if err := json.Unmarshal(line, &entry); err != nil { - return hasUpdates, fmt.Errorf("failed to parse WAL entry %s:%s: %q: %w", walPath, lineNumber, entry, err) + return hasUpdates, fmt.Errorf("failed to parse WAL entry %s:%d: %q: %w", walPath, lineNumber, line, err) } hasUpdates = true if entry.Value == nil { diff --git a/bundle/direct/dstate/wal_test.go b/bundle/direct/dstate/wal_test.go deleted file mode 100644 index d8a5f233452..00000000000 --- a/bundle/direct/dstate/wal_test.go +++ /dev/null @@ -1,549 +0,0 @@ -package dstate - -import ( - "encoding/json" - "os" - "path/filepath" - "testing" - - 
"github.com/databricks/cli/bundle/deployplan" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestWALWriteAndRead(t *testing.T) { - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - - wal, err := openWAL(statePath) - require.NoError(t, err) - - err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 1}) - require.NoError(t, err) - - entry1 := &ResourceEntry{ - ID: "12345", - State: json.RawMessage(`{"name":"job1"}`), - } - err = wal.writeJSON(WALEntry{K: "resources.jobs.job1", V: entry1}) - require.NoError(t, err) - - entry2 := &ResourceEntry{ - ID: "67890", - State: json.RawMessage(`{"name":"job2"}`), - } - err = wal.writeJSON(WALEntry{K: "resources.jobs.job2", V: entry2}) - require.NoError(t, err) - - err = wal.writeJSON(WALEntry{K: "resources.jobs.old_job", V: nil}) - require.NoError(t, err) - - err = wal.close() - require.NoError(t, err) - - header, entries, _, err := readWAL(statePath) - require.NoError(t, err) - - assert.Equal(t, "test-lineage", header.Lineage) - assert.Equal(t, 1, header.Serial) - - require.Len(t, entries, 3) - - assert.Equal(t, "resources.jobs.job1", entries[0].K) - require.NotNil(t, entries[0].V) - assert.Equal(t, "12345", entries[0].V.ID) - - assert.Equal(t, "resources.jobs.job2", entries[1].K) - require.NotNil(t, entries[1].V) - assert.Equal(t, "67890", entries[1].V.ID) - - assert.Equal(t, "resources.jobs.old_job", entries[2].K) - assert.Nil(t, entries[2].V) -} - -func TestCleanupWAL(t *testing.T) { - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - walFilePath := walPath(statePath) - - wal, err := openWAL(statePath) - require.NoError(t, err) - err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 1}) - require.NoError(t, err) - - _, err = os.Stat(walFilePath) - require.NoError(t, err) - - err = wal.close() - require.NoError(t, err) - err = cleanupWAL(statePath) - require.NoError(t, err) - - _, err = os.Stat(walFilePath) - 
assert.True(t, os.IsNotExist(err)) -} - -func TestOpenWALFailsIfFileAlreadyExists(t *testing.T) { - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - - wal, err := openWAL(statePath) - require.NoError(t, err) - require.NoError(t, wal.close()) - - _, err = openWAL(statePath) - require.Error(t, err) - assert.Contains(t, err.Error(), "failed to open WAL file") -} - -func TestRecoverFromWAL_NoWAL(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - - db := NewDatabase("", 0) - recovered, err := recoverFromWAL(ctx, statePath, &db) - require.NoError(t, err) - assert.False(t, recovered) -} - -func TestRecoverFromWAL_ValidWAL(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - - wal, err := openWAL(statePath) - require.NoError(t, err) - err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 1}) - require.NoError(t, err) - - entry := &ResourceEntry{ - ID: "12345", - State: json.RawMessage(`{"name":"job1"}`), - } - err = wal.writeJSON(WALEntry{K: "resources.jobs.job1", V: entry}) - require.NoError(t, err) - err = wal.close() - require.NoError(t, err) - - db := NewDatabase("", 0) - - recovered, err := recoverFromWAL(ctx, statePath, &db) - require.NoError(t, err) - assert.True(t, recovered) - - assert.Equal(t, "test-lineage", db.Lineage) - require.Contains(t, db.State, "resources.jobs.job1") - assert.Equal(t, "12345", db.State["resources.jobs.job1"].ID) -} - -func TestRecoverFromWAL_StaleWAL(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - walFilePath := walPath(statePath) - - wal, err := openWAL(statePath) - require.NoError(t, err) - err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 1}) - require.NoError(t, err) - err = wal.close() - require.NoError(t, err) - - db := NewDatabase("test-lineage", 2) // serial 2 makes WAL stale - - recovered, err := 
recoverFromWAL(ctx, statePath, &db) - require.NoError(t, err) - assert.False(t, recovered) - - _, err = os.Stat(walFilePath) - assert.True(t, os.IsNotExist(err)) -} - -func TestRecoverFromWAL_FutureWAL(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - - wal, err := openWAL(statePath) - require.NoError(t, err) - err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 5}) - require.NoError(t, err) - err = wal.close() - require.NoError(t, err) - - db := NewDatabase("test-lineage", 0) - - _, err = recoverFromWAL(ctx, statePath, &db) - assert.Error(t, err) - assert.Contains(t, err.Error(), "ahead of expected") -} - -func TestRecoverFromWAL_LineageMismatch(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - - wal, err := openWAL(statePath) - require.NoError(t, err) - err = wal.writeJSON(WALHeader{Lineage: "lineage-A", Serial: 1}) - require.NoError(t, err) - err = wal.close() - require.NoError(t, err) - - db := NewDatabase("lineage-B", 0) - - _, err = recoverFromWAL(ctx, statePath, &db) - assert.Error(t, err) - assert.Contains(t, err.Error(), "lineage") -} - -func TestRecoverFromWAL_DeleteOperation(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - - wal, err := openWAL(statePath) - require.NoError(t, err) - err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 1}) - require.NoError(t, err) - - entry := &ResourceEntry{ - ID: "12345", - State: json.RawMessage(`{"name":"job1"}`), - } - err = wal.writeJSON(WALEntry{K: "resources.jobs.job1", V: entry}) - require.NoError(t, err) - - err = wal.writeJSON(WALEntry{K: "resources.jobs.job1", V: nil}) - require.NoError(t, err) - - err = wal.close() - require.NoError(t, err) - - db := NewDatabase("", 0) - - recovered, err := recoverFromWAL(ctx, statePath, &db) - require.NoError(t, err) - assert.True(t, recovered) - - assert.NotContains(t, 
db.State, "resources.jobs.job1") -} - -func TestDeploymentState_WALIntegration(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - walFilePath := walPath(statePath) - - var db DeploymentState - err := db.Open(ctx, statePath) - require.NoError(t, err) - - err = db.SaveState("resources.jobs.job1", "12345", map[string]string{"name": "job1"}, nil) - require.NoError(t, err) - - _, err = os.Stat(walFilePath) - require.NoError(t, err) - - header, entries, _, err := readWAL(statePath) - require.NoError(t, err) - assert.Equal(t, 1, header.Serial) - require.Len(t, entries, 1) - assert.Equal(t, "resources.jobs.job1", entries[0].K) - assert.Equal(t, "12345", entries[0].V.ID) - - err = db.Finalize() - require.NoError(t, err) - - _, err = os.Stat(walFilePath) - assert.True(t, os.IsNotExist(err)) - - data, err := os.ReadFile(statePath) - require.NoError(t, err) - var savedDB Database - err = json.Unmarshal(data, &savedDB) - require.NoError(t, err) - assert.Equal(t, 1, savedDB.Serial) - assert.Contains(t, savedDB.State, "resources.jobs.job1") -} - -func TestDeploymentState_WALRecoveryOnOpen(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - - initialDB := NewDatabase("test-lineage", 5) - initialDB.State["resources.jobs.existing"] = ResourceEntry{ - ID: "existing-id", - State: json.RawMessage(`{"name":"existing"}`), - } - data, err := json.Marshal(initialDB) - require.NoError(t, err) - err = os.WriteFile(statePath, data, 0o600) - require.NoError(t, err) - - wal, err := openWAL(statePath) - require.NoError(t, err) - err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 6}) - require.NoError(t, err) - entry := &ResourceEntry{ - ID: "new-id", - State: json.RawMessage(`{"name":"new"}`), - } - err = wal.writeJSON(WALEntry{K: "resources.jobs.new", V: entry}) - require.NoError(t, err) - err = wal.close() - require.NoError(t, err) - - var db DeploymentState - 
err = db.Open(ctx, statePath) - require.NoError(t, err) - - assert.Contains(t, db.Data.State, "resources.jobs.existing") - assert.Contains(t, db.Data.State, "resources.jobs.new") - assert.Equal(t, "new-id", db.Data.State["resources.jobs.new"].ID) -} - -func TestDeploymentState_DeleteStateWritesWAL(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - - var db DeploymentState - err := db.Open(ctx, statePath) - require.NoError(t, err) - - err = db.SaveState("resources.jobs.job1", "12345", map[string]string{"name": "job1"}, nil) - require.NoError(t, err) - - err = db.DeleteState("resources.jobs.job1") - require.NoError(t, err) - - _, entries, _, err := readWAL(statePath) - require.NoError(t, err) - - require.Len(t, entries, 2) - assert.Equal(t, "resources.jobs.job1", entries[1].K) - assert.Nil(t, entries[1].V) - - err = db.Finalize() - require.NoError(t, err) - - data, err := os.ReadFile(statePath) - require.NoError(t, err) - var savedDB Database - err = json.Unmarshal(data, &savedDB) - require.NoError(t, err) - assert.NotContains(t, savedDB.State, "resources.jobs.job1") -} - -func TestDeploymentState_WALWithDependsOn(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - - var db DeploymentState - err := db.Open(ctx, statePath) - require.NoError(t, err) - t.Cleanup(func() { db.Close() }) - - dependsOn := []deployplan.DependsOnEntry{ - {Node: "resources.clusters.cluster1", Label: "${resources.clusters.cluster1.id}"}, - } - - err = db.SaveState("resources.jobs.job1", "12345", map[string]string{"name": "job1"}, dependsOn) - require.NoError(t, err) - - _, entries, _, err := readWAL(statePath) - require.NoError(t, err) - - require.Len(t, entries, 1) - require.NotNil(t, entries[0].V) - require.Len(t, entries[0].V.DependsOn, 1) - assert.Equal(t, "resources.clusters.cluster1", entries[0].V.DependsOn[0].Node) -} - -func TestRecoverFromWAL_CorruptedMiddleLine(t 
*testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - walFilePath := walPath(statePath) - - content := `{"lineage":"test","serial":1} -{"k":"resources.jobs.job1","v":{"__id__":"12345","state":{}}} -not valid json -{"k":"resources.jobs.job2","v":{"__id__":"67890","state":{}}} -` - err := os.WriteFile(walFilePath, []byte(content), 0o600) - require.NoError(t, err) - - db := NewDatabase("", 0) - recovered, err := recoverFromWAL(ctx, statePath, &db) - require.NoError(t, err) - assert.True(t, recovered) - assert.Len(t, db.State, 2) - assert.Equal(t, "12345", db.State["resources.jobs.job1"].ID) - assert.Equal(t, "67890", db.State["resources.jobs.job2"].ID) - - corruptedPath := walCorruptedPath(statePath) - _, err = os.Stat(corruptedPath) - require.NoError(t, err) - contentBytes, err := os.ReadFile(corruptedPath) - require.NoError(t, err) - assert.Equal(t, "not valid json\n", string(contentBytes)) - _, err = os.Stat(walFilePath) - require.NoError(t, err) -} - -func TestRecoverFromWAL_CorruptedLastLine(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - walFilePath := walPath(statePath) - - content := `{"lineage":"test","serial":1} -{"k":"resources.jobs.job1","v":{"__id__":"12345","state":{}}} -{"k":"resources.jobs.job2","v":{"__id__":"67890","state":{}}} -not valid json -` - err := os.WriteFile(walFilePath, []byte(content), 0o600) - require.NoError(t, err) - - db := NewDatabase("", 0) - recovered, err := recoverFromWAL(ctx, statePath, &db) - require.NoError(t, err) - assert.True(t, recovered) - - assert.Contains(t, db.State, "resources.jobs.job1") - assert.Contains(t, db.State, "resources.jobs.job2") - assert.Equal(t, "12345", db.State["resources.jobs.job1"].ID) - assert.Equal(t, "67890", db.State["resources.jobs.job2"].ID) - - corruptedPath := walCorruptedPath(statePath) - _, err = os.Stat(corruptedPath) - require.NoError(t, err) - contentBytes, err := 
os.ReadFile(corruptedPath) - require.NoError(t, err) - assert.Equal(t, "not valid json\n", string(contentBytes)) -} - -func TestDeploymentState_RecoveredFromWALFlag(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - - initialDB := NewDatabase("test-lineage", 0) - data, err := json.Marshal(initialDB) - require.NoError(t, err) - err = os.WriteFile(statePath, data, 0o600) - require.NoError(t, err) - - wal, err := openWAL(statePath) - require.NoError(t, err) - err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 1}) - require.NoError(t, err) - err = wal.writeJSON(WALEntry{K: "resources.jobs.job1", V: &ResourceEntry{ID: "123", State: json.RawMessage(`{}`)}}) - require.NoError(t, err) - err = wal.close() - require.NoError(t, err) - - var db DeploymentState - err = db.Open(ctx, statePath) - require.NoError(t, err) - - assert.True(t, db.RecoveredFromWAL()) -} - -func TestRecoverFromWAL_LineageAdoption(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - walFilePath := walPath(statePath) - - content := `{"lineage":"adopted-lineage","serial":1} -{"k":"resources.jobs.job1","v":{"__id__":"12345","state":{}}} -` - err := os.WriteFile(walFilePath, []byte(content), 0o600) - require.NoError(t, err) - - db := NewDatabase("", 0) // empty lineage - recovered, err := recoverFromWAL(ctx, statePath, &db) - require.NoError(t, err) - assert.True(t, recovered) - assert.Equal(t, "adopted-lineage", db.Lineage) -} - -func TestReadWAL_EmptyFile(t *testing.T) { - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - walFilePath := walPath(statePath) - - err := os.WriteFile(walFilePath, []byte(""), 0o600) - require.NoError(t, err) - - _, _, _, err = readWAL(statePath) - assert.Error(t, err) - assert.Contains(t, err.Error(), "empty") -} - -func TestDeploymentState_MultipleOperationsSameKey(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - 
statePath := filepath.Join(dir, "resources.json") - - var db DeploymentState - err := db.Open(ctx, statePath) - require.NoError(t, err) - - err = db.SaveState("resources.jobs.job1", "111", map[string]string{"v": "1"}, nil) - require.NoError(t, err) - - err = db.DeleteState("resources.jobs.job1") - require.NoError(t, err) - - err = db.SaveState("resources.jobs.job1", "222", map[string]string{"v": "2"}, nil) - require.NoError(t, err) - - _, entries, _, err := readWAL(statePath) - require.NoError(t, err) - require.Len(t, entries, 3) - assert.Equal(t, "111", entries[0].V.ID) - assert.Nil(t, entries[1].V) - assert.Equal(t, "222", entries[2].V.ID) - - err = db.Finalize() - require.NoError(t, err) - - entry, ok := db.GetResourceEntry("resources.jobs.job1") - require.True(t, ok) - assert.Equal(t, "222", entry.ID) -} - -func TestDeploymentState_FinalizeFailsOnCorruptedWAL(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - walFilePath := walPath(statePath) - - var db DeploymentState - err := db.Open(ctx, statePath) - require.NoError(t, err) - - err = db.SaveState("resources.jobs.job1", "12345", map[string]string{"name": "job1"}, nil) - require.NoError(t, err) - - f, err := os.OpenFile(walFilePath, os.O_WRONLY|os.O_APPEND, 0) - require.NoError(t, err) - _, err = f.WriteString("{\"k\":\"resources.jobs.partial_write\",\"v\":{\"__id__\":\"999\",\"state\":{\"name\":\"partial-\n") - require.NoError(t, err) - require.NoError(t, f.Sync()) - require.NoError(t, f.Close()) - - err = db.Finalize() - require.Error(t, err) - assert.Contains(t, err.Error(), "failed to replay WAL during finalize: corrupted entry at line") - - _, err = os.Stat(walFilePath) - require.NoError(t, err) -} diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index b4d70ede5ad..2a79d6d2098 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -15,6 +15,7 @@ import ( "github.com/databricks/cli/bundle/deploy/terraform" 
"github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/direct" + "github.com/databricks/cli/bundle/direct/dstate" "github.com/databricks/cli/bundle/libraries" "github.com/databricks/cli/bundle/metrics" "github.com/databricks/cli/bundle/permissions" @@ -149,6 +150,8 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand return } + _, localPath := b.StateFilenameDirect(ctx) + if plan != nil { // Initialize DeploymentBundle for applying the loaded plan err := b.DeploymentBundle.InitForApply(ctx, b.WorkspaceClient(ctx), plan) @@ -158,6 +161,12 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand } } else { plan = RunPlan(ctx, b, engine) + err := b.DeploymentBundle.StateDB.Open(ctx, localPath, dstate.WithRecovery(false), dstate.WithWrite(true)) + if err != nil { + logdiag.LogError(ctx, err) + return + } + } if logdiag.HasError(ctx) { diff --git a/cmd/bundle/utils/process.go b/cmd/bundle/utils/process.go index 54391ec4d60..75081de56e6 100644 --- a/cmd/bundle/utils/process.go +++ b/cmd/bundle/utils/process.go @@ -236,7 +236,7 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle // Validate that the plan's lineage and serial match the current state // This must happen before any file operations - err = direct.OpenStateWithPlanCheck(ctx, &b.DeploymentBundle.StateDB, plan) + err = direct.ValidatePlanAgainstState(ctx, &b.DeploymentBundle.StateDB, plan) if err != nil { logdiag.LogError(ctx, err) return b, stateDesc, root.ErrAlreadyPrinted From fb00793df116dc7f3720fbf6c1f1e041b07c17bd Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Fri, 27 Mar 2026 22:40:38 +0100 Subject: [PATCH 16/80] rm unnecessary assert --- bundle/direct/dstate/state.go | 1 - 1 file changed, 1 deletion(-) diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 1e882284276..59d0804bbfc 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ 
-381,7 +381,6 @@ func (db *DeploymentState) ExportState(ctx context.Context) resourcestate.Export } func (db *DeploymentState) unlockedSave() error { - db.AssertOpenedForWrite() data, err := json.MarshalIndent(db.Data, "", " ") if err != nil { return err From 9f3d0ec14d2edb6d384bc89af9c639b1e293a905 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Sat, 28 Mar 2026 08:05:55 +0100 Subject: [PATCH 17/80] Centralize state open/close lifecycle for direct engine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move state open/close management to process.go so the lifecycle is transparent. process.go opens state for read (with WAL recovery) after PullResourcesState and defers close. Deploy/destroy upgrade to write mode via the new UpgradeToWrite() method which initializes the WAL without re-reading state JSON. Internal functions (CalculatePlan, ExportState, InitForApply, ValidatePlanAgainstState) no longer manage their own open/close — they expect state to already be open. Self-managed callers (bind, migrate, yaml_sync, diff) handle their own state lifecycle. Plan command uses ProcessBundleRetWithPlan to compute the plan while state is still open for read inside processBundleRetInternal. 
Co-authored-by: Isaac --- bundle/configsync/diff.go | 6 ++ bundle/direct/bind.go | 24 ++++- bundle/direct/bundle_apply.go | 2 + bundle/direct/bundle_plan.go | 19 ++-- bundle/direct/dstate/state.go | 73 ++++++++++----- bundle/direct/pkg.go | 3 +- bundle/phases/deploy.go | 43 +++++++-- bundle/phases/destroy.go | 13 +++ .../statemgmt/upload_state_for_yaml_sync.go | 24 ++++- cmd/bundle/deployment/migrate.go | 28 +++++- cmd/bundle/plan.go | 6 +- cmd/bundle/utils/process.go | 91 ++++++++++++------- 12 files changed, 242 insertions(+), 90 deletions(-) diff --git a/bundle/configsync/diff.go b/bundle/configsync/diff.go index dee7fa48116..f767966c160 100644 --- a/bundle/configsync/diff.go +++ b/bundle/configsync/diff.go @@ -14,6 +14,7 @@ import ( "github.com/databricks/cli/bundle/deploy" "github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/direct" + "github.com/databricks/cli/bundle/direct/dstate" "github.com/databricks/cli/libs/dyn" "github.com/databricks/cli/libs/dyn/convert" "github.com/databricks/cli/libs/log" @@ -139,6 +140,11 @@ func DetectChanges(ctx context.Context, b *bundle.Bundle, engine engine.EngineTy } } + if err := deployBundle.StateDB.Open(ctx, statePath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { + return nil, fmt.Errorf("failed to open state: %w", err) + } + defer deployBundle.StateDB.Close(ctx) + plan, err := deployBundle.CalculatePlan(ctx, b.WorkspaceClient(ctx), &b.Config) if err != nil { return nil, fmt.Errorf("failed to calculate plan: %w", err) diff --git a/bundle/direct/bind.go b/bundle/direct/bind.go index 74389313af4..fe8ced6d225 100644 --- a/bundle/direct/bind.go +++ b/bundle/direct/bind.go @@ -62,8 +62,10 @@ type BindResult struct { func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.WorkspaceClient, configRoot *config.Root, statePath, resourceKey, resourceID string) (*BindResult, error) { // Check if the resource is already managed (bound to a different ID) var checkStateDB 
dstate.DeploymentState - if err := checkStateDB.Open(ctx, statePath, dstate.WithRecovery(true), dstate.WithWrite(true)); err == nil { - if existingID := checkStateDB.GetResourceID(resourceKey); existingID != "" { + if err := checkStateDB.Open(ctx, statePath, dstate.WithRecovery(true), dstate.WithWrite(false)); err == nil { + existingID := checkStateDB.GetResourceID(resourceKey) + checkStateDB.Close(ctx) + if existingID != "" { return nil, ErrResourceAlreadyBound{ ResourceKey: resourceKey, ExistingID: existingID, @@ -105,11 +107,17 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac log.Infof(ctx, "Bound %s to id=%s (in temp state)", resourceKey, resourceID) // First plan + update: populate state with resolved config + err = b.StateDB.Open(ctx, tmpStatePath, dstate.WithRecovery(true), dstate.WithWrite(false)) + if err != nil { + os.Remove(tmpStatePath) + return nil, err + } plan, err := b.CalculatePlan(ctx, client, configRoot) if err != nil { os.Remove(tmpStatePath) return nil, err } + b.StateDB.Close(ctx) // Populate the state with the resolved config entry := plan.Plan[resourceKey] @@ -132,6 +140,12 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac } } + err = b.StateDB.Open(ctx, tmpStatePath, dstate.WithRecovery(true), dstate.WithWrite(true)) + if err != nil { + os.Remove(tmpStatePath) + return nil, err + } + err = b.StateDB.SaveState(resourceKey, resourceID, sv.Value, dependsOn) if err != nil { os.Remove(tmpStatePath) @@ -146,7 +160,13 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac } // Second plan: this is the plan to present to the user (change between remote resource and config) + err = b.StateDB.Open(ctx, tmpStatePath, dstate.WithRecovery(true), dstate.WithWrite(false)) + if err != nil { + os.Remove(tmpStatePath) + return nil, err + } plan, err = b.CalculatePlan(ctx, client, configRoot) + b.StateDB.Close(ctx) if err != nil { os.Remove(tmpStatePath) return 
nil, err diff --git a/bundle/direct/bundle_apply.go b/bundle/direct/bundle_apply.go index 6bad8091469..6b84f40775f 100644 --- a/bundle/direct/bundle_apply.go +++ b/bundle/direct/bundle_apply.go @@ -151,6 +151,8 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa return true }) + + // Note: caller is responsible for closing StateDB after Apply returns. } func (b *DeploymentBundle) LookupReferencePostDeploy(ctx context.Context, path *structpath.PathNode) (any, error) { diff --git a/bundle/direct/bundle_plan.go b/bundle/direct/bundle_plan.go index 1fb70123b96..4f21d0fa066 100644 --- a/bundle/direct/bundle_plan.go +++ b/bundle/direct/bundle_plan.go @@ -37,24 +37,17 @@ func (b *DeploymentBundle) init(client *databricks.WorkspaceClient) error { return err } -// ValidatePlanAgainstState validates that a plan's lineage and serial match the current state. -// This should be called early in the deployment process, before any file operations. +// ValidatePlanAgainstState validates that a plan's lineage and serial match the given state. // If the plan has no lineage (first deployment), validation is skipped. 
-func ValidatePlanAgainstState(ctx context.Context, stateDB *dstate.DeploymentState, plan *deployplan.Plan) error { - // If plan has no lineage, this is a first deployment before any state exists - // No validation needed +func ValidatePlanAgainstState(stateDB *dstate.DeploymentState, plan *deployplan.Plan) error { if plan.Lineage == "" { return nil } - stateDB.AssertOpened() - - // Validate that the plan's lineage matches the current state's lineage if plan.Lineage != stateDB.Data.Lineage { return fmt.Errorf("plan lineage %q does not match state lineage %q; the state may have been modified by another process", plan.Lineage, stateDB.Data.Lineage) } - // Validate that the plan's serial matches the current state's serial if plan.Serial != stateDB.Data.Serial { return fmt.Errorf("plan serial %d does not match state serial %d; the state has been modified since the plan was created. Please run 'bundle plan' again", plan.Serial, stateDB.Data.Serial) } @@ -63,9 +56,9 @@ func ValidatePlanAgainstState(ctx context.Context, stateDB *dstate.DeploymentSta } // InitForApply initializes the DeploymentBundle for applying a pre-computed plan. -// This is used when --plan is specified to skip the planning phase. +// StateDB must already be open for write before calling this function. func (b *DeploymentBundle) InitForApply(ctx context.Context, client *databricks.WorkspaceClient, plan *deployplan.Plan) error { - b.StateDB.AssertOpened() + b.StateDB.AssertOpenedForWrite() err := b.init(client) if err != nil { @@ -97,8 +90,10 @@ func (b *DeploymentBundle) InitForApply(ctx context.Context, client *databricks. return nil } +// CalculatePlan computes the deployment plan by comparing local config against remote state. +// StateDB must already be open for read before calling this function. 
func (b *DeploymentBundle) CalculatePlan(ctx context.Context, client *databricks.WorkspaceClient, configRoot *config.Root) (*deployplan.Plan, error) { - b.StateDB.AssertOpened() + b.StateDB.AssertOpenedForRead() err := b.init(client) if err != nil { diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 59d0804bbfc..bd59c131eae 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -228,14 +228,12 @@ func (db *DeploymentState) Reload(ctx context.Context) error { func (db *DeploymentState) replayWAL(ctx context.Context) error { walPath := db.Path + walSuffix - hasUpdates, err := db.mergeWalIntoState(ctx) + err := db.mergeWalIntoState(ctx) if err != nil { return fmt.Errorf("failed to apply WAL file %s: %w", walPath, err) } - if hasUpdates { - if err := db.unlockedSave(); err != nil { - return err - } + if err := db.unlockedSave(); err != nil { + return err } err = os.Remove(walPath) if err != nil { @@ -264,16 +262,15 @@ func (db *DeploymentState) validateWALHeader(ctx context.Context, header *WALHea return nil } -func (db *DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) { +func (db *DeploymentState) mergeWalIntoState(ctx context.Context) error { if db.walFile != nil { panic("internal error: walFile must be closed") } - hasUpdates := false walPath := db.Path + walSuffix walFile, err := os.Open(walPath) if err != nil { - return false, fmt.Errorf("failed to open WAL file %s: %w", walPath, err) + return fmt.Errorf("failed to open WAL file %s: %w", walPath, err) } defer walFile.Close() @@ -287,17 +284,19 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) if lineNumber == 1 { var header WALHeader if err := json.Unmarshal(line, &header); err != nil { - return hasUpdates, fmt.Errorf("failed to parse WAL header: %w", err) + return fmt.Errorf("failed to parse WAL header: %w", err) } if err := db.validateWALHeader(ctx, &header); err != nil { - return hasUpdates, err + 
return err } + // Apply header metadata to state (lineage may be new for first deploy) + db.Data.Lineage = header.Lineage + db.Data.Serial = header.Serial } else { var entry WALEntry if err := json.Unmarshal(line, &entry); err != nil { - return hasUpdates, fmt.Errorf("failed to parse WAL entry %s:%d: %q: %w", walPath, lineNumber, line, err) + return fmt.Errorf("failed to parse WAL entry %s:%d: %q: %w", walPath, lineNumber, line, err) } - hasUpdates = true if entry.Value == nil { delete(db.Data.State, entry.Key) } else { @@ -306,22 +305,19 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) } } - if err := scanner.Err(); err != nil { - return hasUpdates, err - } - - if hasUpdates { - // only assume WAL file's serial if we read any data from it - db.Data.Serial += 1 - } - - return hasUpdates, nil + return scanner.Err() } +// Close replays the WAL (if open for write) and resets the state. +// Safe to call multiple times or on an already-closed state. func (db *DeploymentState) Close(ctx context.Context) error { db.mu.Lock() defer db.mu.Unlock() + if db.Path == "" { + return nil + } + var err error if db.walFile != nil { @@ -337,6 +333,39 @@ func (db *DeploymentState) Close(ctx context.Context) error { return err } +// UpgradeToWrite transitions from read mode to write mode without re-reading state. +// State must already be open for read. This initializes the WAL for writing. 
+func (db *DeploymentState) UpgradeToWrite() error { + db.mu.Lock() + defer db.mu.Unlock() + + if db.Path == "" { + return fmt.Errorf("internal error: DeploymentState must be opened first") + } + if db.walFile != nil { + return fmt.Errorf("internal error: DeploymentState is already open for write") + } + + walPath := db.Path + walSuffix + walFile, err := os.OpenFile(walPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600) + if err != nil { + return fmt.Errorf("failed to open WAL file %s: %w", walPath, err) + } + db.walFile = walFile + + lineage := db.Data.Lineage + if lineage == "" { + lineage = uuid.New().String() + } + walHead := WALHeader{ + Lineage: lineage, + Serial: db.Data.Serial + 1, + StateVersion: currentStateVersion, + CLIVersion: build.GetInfo().Version, + } + return appendJSONLine(db.walFile, walHead) +} + func (db *DeploymentState) AssertOpenedForReadOrWrite() { if db.Path == "" { panic("internal error: DeploymentState must be opened first") diff --git a/bundle/direct/pkg.go b/bundle/direct/pkg.go index 58b9bc6b4b1..50beda36f59 100644 --- a/bundle/direct/pkg.go +++ b/bundle/direct/pkg.go @@ -64,7 +64,8 @@ func (d *DeploymentUnit) SetRemoteState(remoteState any) error { return nil } +// ExportState exports the current deployment state as a resource map. +// StateDB must already be open for read before calling this function. func (b *DeploymentBundle) ExportState(ctx context.Context) resourcestate.ExportedResourcesMap { - b.StateDB.AssertOpened() return b.StateDB.ExportState(ctx) } diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index 2a79d6d2098..2b9c115e8af 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -82,6 +82,18 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta bundle.ApplyContext(ctx, b, terraform.Apply()) } + // Close state to replay WAL into state file, then reopen for read. + // PushResourcesState needs the file on disk, Load needs the state in memory. 
+ if targetEngine.IsDirect() { + if err := b.DeploymentBundle.StateDB.Close(ctx); err != nil { + logdiag.LogError(ctx, err) + } + _, localPath := b.StateFilenameDirect(ctx) + if err := b.DeploymentBundle.StateDB.Open(ctx, localPath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { + logdiag.LogError(ctx, err) + } + } + // Even if deployment failed, there might be updates in states that we need to upload statemgmt.PushResourcesState(ctx, b, targetEngine) if logdiag.HasError(ctx) { @@ -150,9 +162,19 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand return } - _, localPath := b.StateFilenameDirect(ctx) - if plan != nil { + if engine.IsDirect() { + // Upgrade from read (opened by process.go) to write mode + if err := b.DeploymentBundle.StateDB.UpgradeToWrite(); err != nil { + logdiag.LogError(ctx, err) + return + } + defer func() { + if err := b.DeploymentBundle.StateDB.Close(ctx); err != nil { + logdiag.LogError(ctx, err) + } + }() + } // Initialize DeploymentBundle for applying the loaded plan err := b.DeploymentBundle.InitForApply(ctx, b.WorkspaceClient(ctx), plan) if err != nil { @@ -160,13 +182,20 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand return } } else { + // State is already open for read by process.go (for direct engine) plan = RunPlan(ctx, b, engine) - err := b.DeploymentBundle.StateDB.Open(ctx, localPath, dstate.WithRecovery(false), dstate.WithWrite(true)) - if err != nil { - logdiag.LogError(ctx, err) - return + if engine.IsDirect() { + // Upgrade from read to write mode (Apply needs write access) + if err := b.DeploymentBundle.StateDB.UpgradeToWrite(); err != nil { + logdiag.LogError(ctx, err) + return + } + defer func() { + if err := b.DeploymentBundle.StateDB.Close(ctx); err != nil { + logdiag.LogError(ctx, err) + } + }() } - } if logdiag.HasError(ctx) { diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index 91640ac6cad..fe93d23081b 100644 --- 
a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -13,6 +13,7 @@ import ( "github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/direct" + "github.com/databricks/cli/bundle/direct/dstate" "github.com/databricks/cli/libs/cmdio" "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/logdiag" @@ -168,6 +169,18 @@ func Destroy(ctx context.Context, b *bundle.Bundle, engine engine.EngineType) { } if hasApproval { + if engine.IsDirect() { + // Upgrade from read (opened by process.go) to write mode + if err := b.DeploymentBundle.StateDB.UpgradeToWrite(); err != nil { + logdiag.LogError(ctx, err) + return + } + defer func() { + if err := b.DeploymentBundle.StateDB.Close(ctx); err != nil { + logdiag.LogError(ctx, err) + } + }() + } destroyCore(ctx, b, plan, engine) } else { cmdio.LogString(ctx, "Destroy cancelled!") diff --git a/bundle/statemgmt/upload_state_for_yaml_sync.go b/bundle/statemgmt/upload_state_for_yaml_sync.go index 74def3174f8..86c9a0c37b6 100644 --- a/bundle/statemgmt/upload_state_for_yaml_sync.go +++ b/bundle/statemgmt/upload_state_for_yaml_sync.go @@ -141,13 +141,17 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun migratedDB := dstate.NewDatabase(tfState.Lineage, tfState.Serial+1) migratedDB.State = state - deploymentBundle := &direct.DeploymentBundle{ - StateDB: dstate.DeploymentState{ - Path: snapshotPath, - Data: migratedDB, - }, + // Write the migrated state to disk so CalculatePlan can read it via Open. 
+ migratedStateJSON, err := json.MarshalIndent(migratedDB, "", " ") + if err != nil { + return diag.FromErr(fmt.Errorf("marshaling migrated state: %w", err)) + } + if err := os.WriteFile(snapshotPath, migratedStateJSON, 0o600); err != nil { + return diag.FromErr(fmt.Errorf("writing migrated state to %s: %w", snapshotPath, err)) } + deploymentBundle := &direct.DeploymentBundle{} + // Apply SecretScopeFixups so the config matches what the direct engine expects. // This adds MANAGE ACL for the current user to all secret scopes, ensuring // the migrated state and config agree on .permissions entries. @@ -173,6 +177,11 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun return false, fmt.Errorf("failed to create uninterpolated config: %w", err) } + if err := deploymentBundle.StateDB.Open(ctx, snapshotPath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { + return diag.FromErr(fmt.Errorf("failed to open state: %w", err)) + } + defer deploymentBundle.StateDB.Close(ctx) + plan, err := deploymentBundle.CalculatePlan(ctx, b.WorkspaceClient(ctx), &uninterpolatedConfig) if err != nil { return false, err @@ -197,6 +206,11 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun } } + err = deploymentBundle.StateDB.Open(ctx, snapshotPath, dstate.WithRecovery(false), dstate.WithWrite(true)) + if err != nil { + return diag.FromErr(fmt.Errorf("reopening state for apply: %w", err)) + } + deploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(true)) if err := deploymentBundle.StateDB.Finalize(); err != nil { return false, err diff --git a/cmd/bundle/deployment/migrate.go b/cmd/bundle/deployment/migrate.go index 5020d88e73a..4c657c1166f 100644 --- a/cmd/bundle/deployment/migrate.go +++ b/cmd/bundle/deployment/migrate.go @@ -8,6 +8,7 @@ import ( "fmt" "os" "os/exec" + "path/filepath" "strings" "github.com/databricks/cli/bundle" @@ -227,12 +228,19 @@ To start using direct engine, set 
"engine: direct" under bundle in your databric migratedDB := dstate.NewDatabase(stateDesc.Lineage, stateDesc.Serial+1) migratedDB.State = state - deploymentBundle := &direct.DeploymentBundle{ - StateDB: dstate.DeploymentState{ - Path: tempStatePath, - Data: migratedDB, - }, + // Write the migrated state to disk so CalculatePlan can read it via Open. + migratedStateJSON, err := json.MarshalIndent(migratedDB, "", " ") + if err != nil { + return fmt.Errorf("marshaling migrated state: %w", err) + } + if err := os.MkdirAll(filepath.Dir(tempStatePath), 0o755); err != nil { + return fmt.Errorf("creating state directory: %w", err) } + if err := os.WriteFile(tempStatePath, migratedStateJSON, 0o600); err != nil { + return fmt.Errorf("writing migrated state to %s: %w", tempStatePath, err) + } + + deploymentBundle := &direct.DeploymentBundle{} tempStatePathAutoRemove := true @@ -250,6 +258,10 @@ To start using direct engine, set "engine: direct" under bundle in your databric return root.ErrAlreadyPrinted } + if err := deploymentBundle.StateDB.Open(ctx, tempStatePath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { + return fmt.Errorf("failed to open state: %w", err) + } + plan, err := deploymentBundle.CalculatePlan(ctx, b.WorkspaceClient(ctx), &b.Config) if err != nil { return err @@ -281,6 +293,12 @@ To start using direct engine, set "engine: direct" under bundle in your databric } } + deploymentBundle.StateDB.Close(ctx) + err = deploymentBundle.StateDB.Open(ctx, tempStatePath, dstate.WithRecovery(false), dstate.WithWrite(true)) + if err != nil { + return fmt.Errorf("reopening state for apply: %w", err) + } + deploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(true)) if err := deploymentBundle.StateDB.Finalize(); err != nil { logdiag.LogError(ctx, err) diff --git a/cmd/bundle/plan.go b/cmd/bundle/plan.go index e3dd63929ed..d14f820f4e0 100644 --- a/cmd/bundle/plan.go +++ b/cmd/bundle/plan.go @@ -7,7 +7,6 @@ import ( 
"github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/deployplan" - "github.com/databricks/cli/bundle/phases" "github.com/databricks/cli/cmd/bundle/utils" "github.com/databricks/cli/cmd/root" "github.com/databricks/cli/libs/flags" @@ -56,14 +55,13 @@ It is useful for previewing changes before running 'bundle deploy'.`, } } - b, stateDesc, err := utils.ProcessBundleRet(cmd, opts) + _, _, plan, err := utils.ProcessBundleRetWithPlan(cmd, opts) if err != nil { return err } ctx := cmd.Context() - plan := phases.RunPlan(ctx, b, stateDesc.Engine) - if logdiag.HasError(ctx) { + if plan == nil || logdiag.HasError(ctx) { return root.ErrAlreadyPrinted } diff --git a/cmd/bundle/utils/process.go b/cmd/bundle/utils/process.go index 75081de56e6..9948b77a342 100644 --- a/cmd/bundle/utils/process.go +++ b/cmd/bundle/utils/process.go @@ -13,6 +13,7 @@ import ( "github.com/databricks/cli/bundle/config/validate" "github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/direct" + "github.com/databricks/cli/bundle/direct/dstate" "github.com/databricks/cli/bundle/phases" "github.com/databricks/cli/bundle/statemgmt" "github.com/databricks/cli/cmd/root" @@ -75,16 +76,33 @@ type ProcessOptions struct { // (after state is opened and IDs loaded, before deferred Finalize). PostStateFunc func(ctx context.Context, b *bundle.Bundle, stateDesc *statemgmt.StateDesc) error + // If true, compute the deployment plan and return it via ProcessBundleRetWithPlan. + // The plan is computed after PreDeployChecks while state is still open for read. 
+ ComputePlan bool + + // Indicate whether the bundle operation originates from the pipelines CLI IsPipelinesCLI bool } func ProcessBundle(cmd *cobra.Command, opts ProcessOptions) (*bundle.Bundle, error) { - b, _, err := ProcessBundleRet(cmd, opts) + b, _, _, err := processBundleRetInternal(cmd, opts) return b, err } -func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle, stateDesc *statemgmt.StateDesc, retErr error) { +func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (*bundle.Bundle, *statemgmt.StateDesc, error) { + b, stateDesc, _, err := processBundleRetInternal(cmd, opts) + return b, stateDesc, err +} + +// ProcessBundleRetWithPlan is like ProcessBundleRet but also computes and returns a deployment plan. +// opts.ComputePlan must be true. +func ProcessBundleRetWithPlan(cmd *cobra.Command, opts ProcessOptions) (*bundle.Bundle, *statemgmt.StateDesc, *deployplan.Plan, error) { + opts.ComputePlan = true + return processBundleRetInternal(cmd, opts) +} + +func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle, stateDesc *statemgmt.StateDesc, plan *deployplan.Plan, retErr error) { var err error ctx := cmd.Context() if opts.SkipInitContext { @@ -116,20 +134,20 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle } if logdiag.HasError(ctx) { - return b, nil, root.ErrAlreadyPrinted + return b, nil, nil, root.ErrAlreadyPrinted } variables, err := cmd.Flags().GetStringSlice("var") if err != nil { logdiag.LogDiag(ctx, diag.FromErr(err)[0]) - return b, nil, err + return b, nil, nil, err } // Initialize variables by assigning them values passed as command line flags configureVariables(cmd, b, variables) if b == nil || logdiag.HasError(ctx) { - return b, nil, root.ErrAlreadyPrinted + return b, nil, nil, root.ErrAlreadyPrinted } ctx = cmd.Context() @@ -152,19 +170,19 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle if opts.IncludeLocations 
{ bundle.ApplyContext(ctx, b, mutator.PopulateLocations()) if logdiag.HasError(ctx) { - return b, nil, root.ErrAlreadyPrinted + return b, nil, nil, root.ErrAlreadyPrinted } } } if logdiag.HasError(ctx) { - return b, nil, root.ErrAlreadyPrinted + return b, nil, nil, root.ErrAlreadyPrinted } if opts.PostInitFunc != nil { err := opts.PostInitFunc(ctx, b) if err != nil { - return b, nil, err + return b, nil, nil, err } } @@ -173,24 +191,27 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle if shouldReadState { requiredEngine, err := ResolveEngineSetting(ctx, b) if err != nil { - return b, nil, err + return b, nil, nil, err } // PullResourcesState depends on stateFiler which needs b.Config.Workspace.StatePath which is set in phases.Initialize ctx, stateDesc = statemgmt.PullResourcesState(ctx, b, statemgmt.AlwaysPull(opts.AlwaysPull), requiredEngine) if logdiag.HasError(ctx) { - return b, stateDesc, root.ErrAlreadyPrinted + return b, stateDesc, nil, root.ErrAlreadyPrinted } cmd.SetContext(ctx) - // Open direct engine state once for all subsequent operations (ExportState, CalculatePlan, Apply, etc.) - needDirectState := stateDesc.Engine.IsDirect() && (opts.InitIDs || opts.ErrorOnEmptyState || opts.Deploy || opts.ReadPlanPath != "" || opts.PreDeployChecks || opts.PostStateFunc != nil) - if needDirectState { + // Open state for read (with WAL recovery) so that ExportState, CalculatePlan, etc. can access it. + // Caller is responsible for closing state when done (Deploy closes read + reopens for write). 
+ if stateDesc.Engine.IsDirect() { _, localPath := b.StateFilenameDirect(ctx) - if err := b.DeploymentBundle.StateDB.Open(localPath); err != nil { - logdiag.LogError(ctx, err) - return b, stateDesc, root.ErrAlreadyPrinted + if err := b.DeploymentBundle.StateDB.Open(ctx, localPath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { + return b, stateDesc, nil, err } + defer func() { + // Close is idempotent — no-op if already closed by Deploy + b.DeploymentBundle.StateDB.Close(ctx) + }() } // These are not safe in plan/deploy because they insert empty config settings for deleted resources. @@ -208,17 +229,15 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle } bundle.ApplySeqContext(ctx, b, mutators...) if logdiag.HasError(ctx) { - return b, stateDesc, root.ErrAlreadyPrinted + return b, stateDesc, nil, root.ErrAlreadyPrinted } } } - var plan *deployplan.Plan - if opts.ReadPlanPath != "" { if !stateDesc.Engine.IsDirect() { logdiag.LogError(ctx, errors.New("--plan is only supported with direct engine (set bundle.engine to \"direct\" or DATABRICKS_BUNDLE_ENGINE=direct)")) - return b, stateDesc, root.ErrAlreadyPrinted + return b, stateDesc, nil, root.ErrAlreadyPrinted } opts.Build = false opts.PreDeployChecks = false @@ -227,7 +246,7 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle plan, err = deployplan.LoadPlanFromFile(opts.ReadPlanPath) if err != nil { logdiag.LogError(ctx, err) - return b, stateDesc, root.ErrAlreadyPrinted + return b, stateDesc, nil, root.ErrAlreadyPrinted } currentVersion := build.GetInfo().Version if plan.CLIVersion != currentVersion { @@ -236,10 +255,10 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle // Validate that the plan's lineage and serial match the current state // This must happen before any file operations - err = direct.ValidatePlanAgainstState(ctx, &b.DeploymentBundle.StateDB, plan) + err = 
direct.ValidatePlanAgainstState(&b.DeploymentBundle.StateDB, plan) if err != nil { logdiag.LogError(ctx, err) - return b, stateDesc, root.ErrAlreadyPrinted + return b, stateDesc, nil, root.ErrAlreadyPrinted } } else if opts.Deploy { opts.Build = true @@ -255,14 +274,14 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle }) if logdiag.HasError(ctx) { - return b, stateDesc, root.ErrAlreadyPrinted + return b, stateDesc, nil, root.ErrAlreadyPrinted } // Pipeline CLI only validation. if opts.IsPipelinesCLI { rejectDefinitions(ctx, b) if logdiag.HasError(ctx) { - return b, stateDesc, root.ErrAlreadyPrinted + return b, stateDesc, nil, root.ErrAlreadyPrinted } } } @@ -270,7 +289,7 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle if opts.Validate { validate.Validate(ctx, b) if logdiag.HasError(ctx) { - return b, stateDesc, root.ErrAlreadyPrinted + return b, stateDesc, nil, root.ErrAlreadyPrinted } } @@ -285,7 +304,7 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle }) if logdiag.HasError(ctx) { - return b, stateDesc, root.ErrAlreadyPrinted + return b, stateDesc, nil, root.ErrAlreadyPrinted } } @@ -294,7 +313,15 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle phases.PreDeployChecks(ctx, b, downgradeWarningToError, stateDesc.Engine) if logdiag.HasError(ctx) { - return b, stateDesc, root.ErrAlreadyPrinted + return b, stateDesc, nil, root.ErrAlreadyPrinted + } + } + + // Compute plan while state is open for read (before Deploy upgrades to write) + if opts.ComputePlan && plan == nil { + plan = phases.RunPlan(ctx, b, stateDesc.Engine) + if logdiag.HasError(ctx) { + return b, stateDesc, nil, root.ErrAlreadyPrinted } } @@ -314,25 +341,25 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle }) if logdiag.HasError(ctx) { - return b, stateDesc, root.ErrAlreadyPrinted + return b, stateDesc, nil, root.ErrAlreadyPrinted } 
if b != nil && stateDesc != nil && stateDesc.Engine.IsDirect() && stateDesc.HasRemoteTerraformState() { statemgmt.BackupRemoteTerraformState(ctx, b) if logdiag.HasError(ctx) { - return b, stateDesc, root.ErrAlreadyPrinted + return b, stateDesc, nil, root.ErrAlreadyPrinted } } } if opts.PostStateFunc != nil { if err := opts.PostStateFunc(ctx, b, stateDesc); err != nil { - return b, stateDesc, err + return b, stateDesc, nil, err } } - return b, stateDesc, nil + return b, stateDesc, plan, nil } // ResolveEngineSetting determines the effective engine setting by combining bundle config and env var. From adf0621da11d7b21cc39aafa32daaa047d796202 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Sat, 28 Mar 2026 08:08:09 +0100 Subject: [PATCH 18/80] lint --- bundle/direct/dstate/state.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index bd59c131eae..1433bd35db8 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -4,6 +4,7 @@ import ( "bufio" "context" "encoding/json" + "errors" "fmt" "io/fs" "os" @@ -340,10 +341,10 @@ func (db *DeploymentState) UpgradeToWrite() error { defer db.mu.Unlock() if db.Path == "" { - return fmt.Errorf("internal error: DeploymentState must be opened first") + return errors.New("internal error: DeploymentState must be opened first") } if db.walFile != nil { - return fmt.Errorf("internal error: DeploymentState is already open for write") + return errors.New("internal error: DeploymentState is already open for write") } walPath := db.Path + walSuffix From cdc8e2a3cb76185e7fc5c78fa52b7ffecaf2ce35 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Wed, 29 Apr 2026 16:43:00 +0200 Subject: [PATCH 19/80] fixes Co-authored-by: Denis Bilenko --- bundle/configsync/diff.go | 8 ++--- bundle/direct/dstate/state.go | 8 ++--- bundle/direct/dstate/state_test.go | 33 +++++++++++-------- bundle/phases/deploy.go | 7 ---- bundle/phases/destroy.go | 7 
---- .../statemgmt/upload_state_for_yaml_sync.go | 19 ++++++----- cmd/bundle/deployment/migrate.go | 2 +- cmd/bundle/generate/dashboard.go | 4 ++- 8 files changed, 39 insertions(+), 49 deletions(-) diff --git a/bundle/configsync/diff.go b/bundle/configsync/diff.go index f767966c160..1770d945490 100644 --- a/bundle/configsync/diff.go +++ b/bundle/configsync/diff.go @@ -135,16 +135,12 @@ func DetectChanges(ctx context.Context, b *bundle.Bundle, engine engine.EngineTy } else { deployBundle = &direct.DeploymentBundle{} _, statePath := b.StateFilenameConfigSnapshot(ctx) - if err := deployBundle.StateDB.Open(statePath); err != nil { + if err := deployBundle.StateDB.Open(ctx, statePath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { return nil, fmt.Errorf("failed to open state: %w", err) } + defer deployBundle.StateDB.Close(ctx) } - if err := deployBundle.StateDB.Open(ctx, statePath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { - return nil, fmt.Errorf("failed to open state: %w", err) - } - defer deployBundle.StateDB.Close(ctx) - plan, err := deployBundle.CalculatePlan(ctx, b.WorkspaceClient(ctx), &b.Config) if err != nil { return nil, fmt.Errorf("failed to calculate plan: %w", err) diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 1433bd35db8..30619277316 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -210,8 +210,6 @@ func (db *DeploymentState) Open(ctx context.Context, path string, withRecovery W } func (db *DeploymentState) Reload(ctx context.Context) error { - - db.stateIDs = make(map[string]string) data, err := os.ReadFile(db.Path) if err != nil { @@ -249,7 +247,7 @@ func (db *DeploymentState) validateWALHeader(ctx context.Context, header *WALHea } if header.StateVersion != db.Data.StateVersion { - return fmt.Errorf("state_version in the header (%q) does not match the one in the state (%q)", header.StateVersion, db.Data.StateVersion) + return 
fmt.Errorf("state_version in the header (%d) does not match the one in the state (%d)", header.StateVersion, db.Data.StateVersion) } if header.Lineage != db.Data.Lineage && db.Data.Lineage != "" { @@ -257,7 +255,7 @@ func (db *DeploymentState) validateWALHeader(ctx context.Context, header *WALHea } if header.Serial != db.Data.Serial+1 { - return fmt.Errorf("serial in the header (%q) is not one higher than the one in the state (%q)", header.Serial, db.Data.Serial) + return fmt.Errorf("serial in the header (%d) is not one higher than the one in the state (%d)", header.Serial, db.Data.Serial) } return nil @@ -280,7 +278,7 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) error { lineNumber := 0 for scanner.Scan() { - lineNumber += 1 + lineNumber++ line := scanner.Bytes() if lineNumber == 1 { var header WALHeader diff --git a/bundle/direct/dstate/state_test.go b/bundle/direct/dstate/state_test.go index acd2a9e5336..8f5b04bfe9b 100644 --- a/bundle/direct/dstate/state_test.go +++ b/bundle/direct/dstate/state_test.go @@ -8,46 +8,51 @@ import ( "github.com/stretchr/testify/require" ) -func TestOpenSaveFinalizeRoundTrip(t *testing.T) { +func TestOpenCloseRoundTrip(t *testing.T) { path := filepath.Join(t.TempDir(), "state.json") var db DeploymentState - require.NoError(t, db.Open(path)) + require.NoError(t, db.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) require.NoError(t, db.SaveState("jobs.my_job", "123", map[string]string{"key": "val"}, nil)) - require.NoError(t, db.Finalize()) + require.NoError(t, db.Close(t.Context())) // Re-open and verify persisted data. 
var db2 DeploymentState - require.NoError(t, db2.Open(path)) + require.NoError(t, db2.Open(t.Context(), path, WithRecovery(false), WithWrite(false))) assert.Equal(t, 1, db2.Data.Serial) assert.Equal(t, "123", db2.GetResourceID("jobs.my_job")) + require.NoError(t, db2.Close(t.Context())) } func TestPanicOnDoubleOpen(t *testing.T) { path := filepath.Join(t.TempDir(), "state.json") var db DeploymentState - require.NoError(t, db.Open(path)) + require.NoError(t, db.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) assert.Panics(t, func() { - _ = db.Open(path) + _ = db.Open(t.Context(), path, WithRecovery(true), WithWrite(true)) }) + db.Close(t.Context()) } func TestDeleteState(t *testing.T) { path := filepath.Join(t.TempDir(), "state.json") var db DeploymentState - require.NoError(t, db.Open(path)) + require.NoError(t, db.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) require.NoError(t, db.SaveState("jobs.my_job", "123", map[string]string{}, nil)) - require.NoError(t, db.Finalize()) - - require.NoError(t, db.DeleteState("jobs.my_job")) - require.NoError(t, db.Finalize()) + require.NoError(t, db.Close(t.Context())) var db2 DeploymentState - require.NoError(t, db2.Open(path)) - assert.Equal(t, 2, db2.Data.Serial) - assert.Equal(t, "", db2.GetResourceID("jobs.my_job")) + require.NoError(t, db2.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) + require.NoError(t, db2.DeleteState("jobs.my_job")) + require.NoError(t, db2.Close(t.Context())) + + var db3 DeploymentState + require.NoError(t, db3.Open(t.Context(), path, WithRecovery(false), WithWrite(false))) + assert.Equal(t, 2, db3.Data.Serial) + assert.Equal(t, "", db3.GetResourceID("jobs.my_job")) + require.NoError(t, db3.Close(t.Context())) } diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index 2b9c115e8af..70a81d74607 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -71,13 +71,6 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan 
*deployplan.Plan, ta if targetEngine.IsDirect() { b.DeploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(false)) - // Finalize state: write to disk even if deploy failed, so partial progress is saved. - // Skip for empty plans to avoid creating a state file when nothing was deployed. - if len(plan.Plan) > 0 { - if err := b.DeploymentBundle.StateDB.Finalize(); err != nil { - logdiag.LogError(ctx, err) - } - } } else { bundle.ApplyContext(ctx, b, terraform.Apply()) } diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index fe93d23081b..b9ff5873bf8 100644 --- a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -13,7 +13,6 @@ import ( "github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/direct" - "github.com/databricks/cli/bundle/direct/dstate" "github.com/databricks/cli/libs/cmdio" "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/logdiag" @@ -77,12 +76,6 @@ func approvalForDestroy(ctx context.Context, b *bundle.Bundle, plan *deployplan. func destroyCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, engine engine.EngineType) { if engine.IsDirect() { b.DeploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(false)) - // Skip Finalize for empty plans to avoid creating a state file when nothing was destroyed. - if len(plan.Plan) > 0 { - if err := b.DeploymentBundle.StateDB.Finalize(); err != nil { - logdiag.LogError(ctx, err) - } - } } else { // Core destructive mutators for destroy. These require informed user consent. 
bundle.ApplyContext(ctx, b, terraform.Apply()) diff --git a/bundle/statemgmt/upload_state_for_yaml_sync.go b/bundle/statemgmt/upload_state_for_yaml_sync.go index 86c9a0c37b6..75314c14223 100644 --- a/bundle/statemgmt/upload_state_for_yaml_sync.go +++ b/bundle/statemgmt/upload_state_for_yaml_sync.go @@ -144,10 +144,10 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun // Write the migrated state to disk so CalculatePlan can read it via Open. migratedStateJSON, err := json.MarshalIndent(migratedDB, "", " ") if err != nil { - return diag.FromErr(fmt.Errorf("marshaling migrated state: %w", err)) + return false, fmt.Errorf("marshaling migrated state: %w", err) } if err := os.WriteFile(snapshotPath, migratedStateJSON, 0o600); err != nil { - return diag.FromErr(fmt.Errorf("writing migrated state to %s: %w", snapshotPath, err)) + return false, fmt.Errorf("writing migrated state to %s: %w", snapshotPath, err) } deploymentBundle := &direct.DeploymentBundle{} @@ -178,12 +178,12 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun } if err := deploymentBundle.StateDB.Open(ctx, snapshotPath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { - return diag.FromErr(fmt.Errorf("failed to open state: %w", err)) + return false, fmt.Errorf("failed to open state: %w", err) } - defer deploymentBundle.StateDB.Close(ctx) plan, err := deploymentBundle.CalculatePlan(ctx, b.WorkspaceClient(ctx), &uninterpolatedConfig) if err != nil { + deploymentBundle.StateDB.Close(ctx) return false, err } @@ -206,13 +206,16 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun } } - err = deploymentBundle.StateDB.Open(ctx, snapshotPath, dstate.WithRecovery(false), dstate.WithWrite(true)) - if err != nil { - return diag.FromErr(fmt.Errorf("reopening state for apply: %w", err)) + // Close read state and reopen for write so Apply can record state changes via WAL. 
+ if err := deploymentBundle.StateDB.Close(ctx); err != nil { + return false, fmt.Errorf("closing state after plan: %w", err) + } + if err := deploymentBundle.StateDB.Open(ctx, snapshotPath, dstate.WithRecovery(false), dstate.WithWrite(true)); err != nil { + return false, fmt.Errorf("reopening state for apply: %w", err) } deploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(true)) - if err := deploymentBundle.StateDB.Finalize(); err != nil { + if err := deploymentBundle.StateDB.Close(ctx); err != nil { return false, err } diff --git a/cmd/bundle/deployment/migrate.go b/cmd/bundle/deployment/migrate.go index 4c657c1166f..fb2dbf6ad56 100644 --- a/cmd/bundle/deployment/migrate.go +++ b/cmd/bundle/deployment/migrate.go @@ -300,7 +300,7 @@ To start using direct engine, set "engine: direct" under bundle in your databric } deploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(true)) - if err := deploymentBundle.StateDB.Finalize(); err != nil { + if err := deploymentBundle.StateDB.Close(ctx); err != nil { logdiag.LogError(ctx, err) } if logdiag.HasError(ctx) { diff --git a/cmd/bundle/generate/dashboard.go b/cmd/bundle/generate/dashboard.go index 70de46225c2..7c9510de178 100644 --- a/cmd/bundle/generate/dashboard.go +++ b/cmd/bundle/generate/dashboard.go @@ -17,6 +17,7 @@ import ( "github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/generate" + "github.com/databricks/cli/bundle/direct/dstate" "github.com/databricks/cli/bundle/phases" "github.com/databricks/cli/bundle/resources" "github.com/databricks/cli/bundle/statemgmt" @@ -391,10 +392,11 @@ func (d *dashboard) runForResource(ctx context.Context, b *bundle.Bundle) { if stateDesc.Engine.IsDirect() { _, localPath := b.StateFilenameDirect(ctx) - if err := b.DeploymentBundle.StateDB.Open(localPath); err != nil { + if err := b.DeploymentBundle.StateDB.Open(ctx, localPath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { logdiag.LogError(ctx, err) 
return } + defer b.DeploymentBundle.StateDB.Close(ctx) } bundle.ApplySeqContext(ctx, b, From 9c77b425ae07b5b64d5bd40f6c96e2c60b969d9a Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Wed, 29 Apr 2026 16:47:12 +0200 Subject: [PATCH 20/80] lint Co-authored-by: Denis Bilenko --- cmd/bundle/generate/dashboard.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/bundle/generate/dashboard.go b/cmd/bundle/generate/dashboard.go index 7c9510de178..609b48f9817 100644 --- a/cmd/bundle/generate/dashboard.go +++ b/cmd/bundle/generate/dashboard.go @@ -16,8 +16,8 @@ import ( "time" "github.com/databricks/cli/bundle" - "github.com/databricks/cli/bundle/generate" "github.com/databricks/cli/bundle/direct/dstate" + "github.com/databricks/cli/bundle/generate" "github.com/databricks/cli/bundle/phases" "github.com/databricks/cli/bundle/resources" "github.com/databricks/cli/bundle/statemgmt" From 0669b83eb5d2b2703910d1fb3c478697225d4ac9 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Thu, 30 Apr 2026 11:52:17 +0200 Subject: [PATCH 21/80] restore test --- .../out.deploy.direct.txt | 7 ------- .../out.deploy.terraform.txt | 6 ------ .../artifact_upload_with_no_library_reference/output.txt | 6 ++++++ .../artifact_upload_with_no_library_reference/script | 2 +- .../artifact_upload_with_no_library_reference/test.toml | 5 ----- 5 files changed, 7 insertions(+), 19 deletions(-) delete mode 100644 acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt delete mode 100644 acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt deleted file mode 100644 index f75a5428b16..00000000000 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt +++ /dev/null @@ -1,7 +0,0 
@@ - ->>> [CLI] bundle deploy -Uploading whl/source.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt deleted file mode 100644 index 8ec9c52db62..00000000000 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading whl/source.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt index 6c8bd962a56..6d24880e6c0 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt @@ -1,4 +1,10 @@ +>>> [CLI] bundle deploy +Uploading whl/source.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! 
+ === Expecting wheel to be uploaded >>> jq .path "/api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files/whl/source.whl" diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script index fba3a777006..883601185c9 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script @@ -2,7 +2,7 @@ mkdir -p whl echo "test wheel content" > whl/source.whl -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy title "Expecting wheel to be uploaded" trace jq .path < out.requests.txt | grep import | grep whl | sort diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml index 67a9da6c977..a0a680e9d19 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml @@ -1,5 +1,4 @@ RecordRequests = true -EnvVaryOutput = "DATABRICKS_BUNDLE_ENGINE" Ignore = [ '.venv', 'dist', @@ -9,10 +8,6 @@ Ignore = [ '*.whl', ] -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' - [[Server]] Pattern = "GET /api/2.1/clusters/get" Response.Body = ''' From 57a43a73fae8aa6fc5113c86610af8e033a08d9f Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Thu, 30 Apr 2026 12:01:55 +0200 Subject: [PATCH 22/80] Skip state file write when WAL has no resource entries If only the WAL header was written (no resource changes), replayWAL now discards the WAL without saving the state file. This avoids the spurious "Updating deployment state..." message on no-op deploys in the direct engine. 
Co-authored-by: Denis Bilenko --- bundle/direct/dstate/state.go | 20 +++++++++++--------- bundle/direct/dstate/state_test.go | 12 ++++++++++++ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 30619277316..b0110d519df 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -227,12 +227,14 @@ func (db *DeploymentState) Reload(ctx context.Context) error { func (db *DeploymentState) replayWAL(ctx context.Context) error { walPath := db.Path + walSuffix - err := db.mergeWalIntoState(ctx) + hasEntries, err := db.mergeWalIntoState(ctx) if err != nil { return fmt.Errorf("failed to apply WAL file %s: %w", walPath, err) } - if err := db.unlockedSave(); err != nil { - return err + if hasEntries { + if err := db.unlockedSave(); err != nil { + return err + } } err = os.Remove(walPath) if err != nil { @@ -261,7 +263,7 @@ func (db *DeploymentState) validateWALHeader(ctx context.Context, header *WALHea return nil } -func (db *DeploymentState) mergeWalIntoState(ctx context.Context) error { +func (db *DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) { if db.walFile != nil { panic("internal error: walFile must be closed") } @@ -269,7 +271,7 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) error { walPath := db.Path + walSuffix walFile, err := os.Open(walPath) if err != nil { - return fmt.Errorf("failed to open WAL file %s: %w", walPath, err) + return false, fmt.Errorf("failed to open WAL file %s: %w", walPath, err) } defer walFile.Close() @@ -283,10 +285,10 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) error { if lineNumber == 1 { var header WALHeader if err := json.Unmarshal(line, &header); err != nil { - return fmt.Errorf("failed to parse WAL header: %w", err) + return false, fmt.Errorf("failed to parse WAL header: %w", err) } if err := db.validateWALHeader(ctx, &header); err != nil { - return err + return false, 
err } // Apply header metadata to state (lineage may be new for first deploy) db.Data.Lineage = header.Lineage @@ -294,7 +296,7 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) error { } else { var entry WALEntry if err := json.Unmarshal(line, &entry); err != nil { - return fmt.Errorf("failed to parse WAL entry %s:%d: %q: %w", walPath, lineNumber, line, err) + return false, fmt.Errorf("failed to parse WAL entry %s:%d: %q: %w", walPath, lineNumber, line, err) } if entry.Value == nil { delete(db.Data.State, entry.Key) @@ -304,7 +306,7 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) error { } } - return scanner.Err() + return lineNumber > 1, scanner.Err() } // Close replays the WAL (if open for write) and resets the state. diff --git a/bundle/direct/dstate/state_test.go b/bundle/direct/dstate/state_test.go index 8f5b04bfe9b..8e817dd1988 100644 --- a/bundle/direct/dstate/state_test.go +++ b/bundle/direct/dstate/state_test.go @@ -1,6 +1,7 @@ package dstate import ( + "os" "path/filepath" "testing" @@ -25,6 +26,17 @@ func TestOpenCloseRoundTrip(t *testing.T) { require.NoError(t, db2.Close(t.Context())) } +func TestCloseWithNoEntriesDoesNotWriteStateFile(t *testing.T) { + path := filepath.Join(t.TempDir(), "state.json") + + var db DeploymentState + require.NoError(t, db.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) + require.NoError(t, db.Close(t.Context())) + + _, err := os.Stat(path) + assert.ErrorIs(t, err, os.ErrNotExist) +} + func TestPanicOnDoubleOpen(t *testing.T) { path := filepath.Join(t.TempDir(), "state.json") From f7d6a5c746831c131fb693af9d646650a368ad45 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Thu, 30 Apr 2026 12:07:47 +0200 Subject: [PATCH 23/80] Revert per-engine test splits for no-resource deploys The splits were introduced because the direct engine printed "Updating deployment state..." on deploys with no resource changes, while terraform did not. 
The preceding commit fixes the root cause (WAL without entries no longer writes the state file), so both engines now produce identical output for these tests. Co-authored-by: Denis Bilenko --- .../build_and_files_whl/out.deploy.direct.txt | 8 ------- .../out.deploy.terraform.txt | 7 ------ .../artifacts/build_and_files_whl/output.txt | 7 ++++++ .../artifacts/build_and_files_whl/script | 2 +- .../artifacts/build_and_files_whl/test.toml | 4 ---- .../shell/bash/out.deploy.direct.txt | 7 ------ .../shell/bash/out.deploy.terraform.txt | 6 ----- .../bundle/artifacts/shell/bash/output.txt | 5 ++++ acceptance/bundle/artifacts/shell/bash/script | 2 +- .../shell/basic/out.deploy.direct.txt | 7 ------ .../shell/basic/out.deploy.terraform.txt | 6 ----- .../bundle/artifacts/shell/basic/output.txt | 5 ++++ .../bundle/artifacts/shell/basic/script | 2 +- .../shell/default/out.deploy.direct.txt | 7 ------ .../shell/default/out.deploy.terraform.txt | 6 ----- .../bundle/artifacts/shell/default/output.txt | 5 ++++ .../bundle/artifacts/shell/default/script | 2 +- .../artifacts/shell/sh/out.deploy.direct.txt | 7 ------ .../shell/sh/out.deploy.terraform.txt | 6 ----- .../bundle/artifacts/shell/sh/output.txt | 5 ++++ acceptance/bundle/artifacts/shell/sh/script | 2 +- acceptance/bundle/artifacts/shell/test.toml | 4 ---- .../deploy/empty-bundle/out.deploy.direct.txt | 6 ----- .../empty-bundle/out.deploy.terraform.txt | 5 ---- .../bundle/deploy/empty-bundle/output.txt | 5 ++++ acceptance/bundle/deploy/empty-bundle/script | 2 +- .../bundle/deploy/empty-bundle/test.toml | 3 --- .../bundle/scripts/out.deploy.direct.txt | 24 ------------------- .../bundle/scripts/out.deploy.terraform.txt | 23 ------------------ acceptance/bundle/scripts/output.txt | 23 ++++++++++++++++++ .../out.deploy.direct.txt | 18 -------------- .../out.deploy.terraform.txt | 17 ------------- .../scripts/restricted-execution/output.txt | 17 +++++++++++++ .../scripts/restricted-execution/script | 2 +- 
.../scripts/restricted-execution/test.toml | 3 --- acceptance/bundle/scripts/script | 2 +- acceptance/bundle/scripts/test.toml | 3 --- .../out.deploy-one.direct.txt | 6 ----- .../out.deploy-one.terraform.txt | 5 ---- .../out.deploy-two.direct.txt | 6 ----- .../out.deploy-two.terraform.txt | 5 ---- .../deploy-artifact-path-type/output.txt | 10 ++++++++ .../deploy-artifact-path-type/script | 4 ++-- .../deploy-artifact-path-type/test.toml | 4 ---- .../out.deploy.direct.txt | 6 ----- .../out.deploy.terraform.txt | 5 ---- .../deploy-config-file-count/output.txt | 5 ++++ .../telemetry/deploy-config-file-count/script | 2 +- .../deploy-config-file-count/test.toml | 3 --- .../deploy-mode/out.deploy-dev.direct.txt | 6 ----- .../deploy-mode/out.deploy-dev.terraform.txt | 5 ---- .../deploy-mode/out.deploy-prod.direct.txt | 12 ---------- .../deploy-mode/out.deploy-prod.terraform.txt | 11 --------- .../bundle/telemetry/deploy-mode/output.txt | 16 +++++++++++++ .../bundle/telemetry/deploy-mode/script | 4 ++-- .../bundle/telemetry/deploy-mode/test.toml | 3 --- .../deploy-target-count/out.deploy.direct.txt | 6 ----- .../out.deploy.terraform.txt | 5 ---- .../telemetry/deploy-target-count/output.txt | 5 ++++ .../telemetry/deploy-target-count/script | 2 +- .../telemetry/deploy-target-count/test.toml | 3 --- .../out.deploy.direct.txt | 6 ----- .../out.deploy.terraform.txt | 5 ---- .../deploy-variable-count/output.txt | 5 ++++ .../telemetry/deploy-variable-count/script | 2 +- .../telemetry/deploy-variable-count/test.toml | 4 ---- .../out.deploy-one.direct.txt | 8 ------- .../out.deploy-one.terraform.txt | 7 ------ .../out.deploy-two.direct.txt | 8 ------- .../out.deploy-two.terraform.txt | 7 ------ .../telemetry/deploy-whl-artifacts/output.txt | 14 +++++++++++ .../telemetry/deploy-whl-artifacts/script | 4 ++-- .../telemetry/deploy-whl-artifacts/test.toml | 4 ---- .../sync_patterns/out.deploy.direct.txt | 6 ----- .../sync_patterns/out.deploy.terraform.txt | 5 ---- 
.../bundle/validate/sync_patterns/output.txt | 5 ++++ .../bundle/validate/sync_patterns/script | 2 +- .../bundle/validate/sync_patterns/test.toml | 4 ---- acceptance/cache/simple/out.deploy.direct.txt | 6 ----- .../cache/simple/out.deploy.terraform.txt | 5 ---- acceptance/cache/simple/script | 2 +- acceptance/cache/simple/test.toml | 7 ------ 82 files changed, 151 insertions(+), 369 deletions(-) delete mode 100644 acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt delete mode 100644 acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt delete mode 100644 acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt delete mode 100644 acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt delete mode 100644 acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt delete mode 100644 acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt delete mode 100644 acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/scripts/out.deploy.direct.txt delete mode 100644 acceptance/bundle/scripts/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt delete mode 100644 acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/scripts/restricted-execution/test.toml delete mode 100644 acceptance/bundle/scripts/test.toml delete mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt delete mode 100644 
acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-config-file-count/test.toml delete mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-mode/test.toml delete mode 100644 acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-target-count/test.toml delete mode 100644 acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt delete mode 100644 acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt delete mode 100644 acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt delete mode 100644 
acceptance/cache/simple/out.deploy.direct.txt delete mode 100644 acceptance/cache/simple/out.deploy.terraform.txt diff --git a/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt b/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt deleted file mode 100644 index 4039d5917e8..00000000000 --- a/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt +++ /dev/null @@ -1,8 +0,0 @@ - ->>> errcode [CLI] bundle deploy -Building artifact_with_custom_dist... -Uploading mydist/my_test_code-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt b/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt deleted file mode 100644 index 9894e5b89ff..00000000000 --- a/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> errcode [CLI] bundle deploy -Building artifact_with_custom_dist... -Uploading mydist/my_test_code-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/build_and_files_whl/output.txt b/acceptance/bundle/artifacts/build_and_files_whl/output.txt index d44a21b582a..b618de6b89a 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/output.txt +++ b/acceptance/bundle/artifacts/build_and_files_whl/output.txt @@ -7,3 +7,10 @@ Workspace: Path: /Workspace/Users/[USERNAME]/.bundle/test-bundle/default Validation OK! + +>>> errcode [CLI] bundle deploy +Building artifact_with_custom_dist... +Uploading mydist/my_test_code-0.0.1-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/artifacts/build_and_files_whl/script b/acceptance/bundle/artifacts/build_and_files_whl/script index 9aa0d870e7a..2d7d63f7fec 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/script +++ b/acceptance/bundle/artifacts/build_and_files_whl/script @@ -1,5 +1,5 @@ cp -r $TESTDIR/../whl_explicit/my_test_code/{setup.py,src} . trace $CLI bundle validate # I expect this deploy to work because I explicitly told where to find the wheel, but it does not: -trace errcode $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace errcode $CLI bundle deploy rm mydist/my_test_code-0.0.1-py3-none-any.whl setup.py src/*.py diff --git a/acceptance/bundle/artifacts/build_and_files_whl/test.toml b/acceptance/bundle/artifacts/build_and_files_whl/test.toml index 8b65645e5a3..a030353d571 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/test.toml +++ b/acceptance/bundle/artifacts/build_and_files_whl/test.toml @@ -1,5 +1 @@ RecordRequests = false - -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' diff --git a/acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt deleted file mode 100644 index f311959abdd..00000000000 --- a/acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt deleted file mode 100644 index fa5d7b76bcd..00000000000 --- a/acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... 
-Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/bash/output.txt b/acceptance/bundle/artifacts/shell/bash/output.txt index 8b137891791..fa5d7b76bcd 100644 --- a/acceptance/bundle/artifacts/shell/bash/output.txt +++ b/acceptance/bundle/artifacts/shell/bash/output.txt @@ -1 +1,6 @@ +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/bash/script b/acceptance/bundle/artifacts/shell/bash/script index 09bb41643ca..68ebb78d775 100644 --- a/acceptance/bundle/artifacts/shell/bash/script +++ b/acceptance/bundle/artifacts/shell/bash/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy diff --git a/acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt deleted file mode 100644 index 3a4ff9138ba..00000000000 --- a/acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-basic/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt deleted file mode 100644 index b5e01c79e67..00000000000 --- a/acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-basic/default/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/basic/output.txt b/acceptance/bundle/artifacts/shell/basic/output.txt index 8b137891791..b5e01c79e67 100644 --- a/acceptance/bundle/artifacts/shell/basic/output.txt +++ b/acceptance/bundle/artifacts/shell/basic/output.txt @@ -1 +1,6 @@ +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-basic/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/basic/script b/acceptance/bundle/artifacts/shell/basic/script index 09bb41643ca..68ebb78d775 100644 --- a/acceptance/bundle/artifacts/shell/basic/script +++ b/acceptance/bundle/artifacts/shell/basic/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy diff --git a/acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt deleted file mode 100644 index f311959abdd..00000000000 --- a/acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt deleted file mode 100644 index fa5d7b76bcd..00000000000 --- a/acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/default/output.txt b/acceptance/bundle/artifacts/shell/default/output.txt index 8b137891791..fa5d7b76bcd 100644 --- a/acceptance/bundle/artifacts/shell/default/output.txt +++ b/acceptance/bundle/artifacts/shell/default/output.txt @@ -1 +1,6 @@ +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/default/script b/acceptance/bundle/artifacts/shell/default/script index 09bb41643ca..68ebb78d775 100644 --- a/acceptance/bundle/artifacts/shell/default/script +++ b/acceptance/bundle/artifacts/shell/default/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy diff --git a/acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt deleted file mode 100644 index 98820986f53..00000000000 --- a/acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-sh/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt deleted file mode 100644 index 5117e6e9fc0..00000000000 --- a/acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-sh/default/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/sh/output.txt b/acceptance/bundle/artifacts/shell/sh/output.txt index 8b137891791..5117e6e9fc0 100644 --- a/acceptance/bundle/artifacts/shell/sh/output.txt +++ b/acceptance/bundle/artifacts/shell/sh/output.txt @@ -1 +1,6 @@ +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-sh/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/sh/script b/acceptance/bundle/artifacts/shell/sh/script index 09bb41643ca..68ebb78d775 100644 --- a/acceptance/bundle/artifacts/shell/sh/script +++ b/acceptance/bundle/artifacts/shell/sh/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy diff --git a/acceptance/bundle/artifacts/shell/test.toml b/acceptance/bundle/artifacts/shell/test.toml index df72afb6c82..9796804e9a9 100644 --- a/acceptance/bundle/artifacts/shell/test.toml +++ b/acceptance/bundle/artifacts/shell/test.toml @@ -1,7 +1,3 @@ Local = true Cloud = false RecordRequests = false - -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' diff --git a/acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt b/acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt deleted file mode 100644 index 81dddfcb9fc..00000000000 --- a/acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! 
diff --git a/acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt b/acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt deleted file mode 100644 index 494f76c84fa..00000000000 --- a/acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/deploy/empty-bundle/output.txt b/acceptance/bundle/deploy/empty-bundle/output.txt index 8498653a6e7..919accb661f 100644 --- a/acceptance/bundle/deploy/empty-bundle/output.txt +++ b/acceptance/bundle/deploy/empty-bundle/output.txt @@ -1,4 +1,9 @@ +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... +Deploying resources... +Deployment complete! + >>> [CLI] bundle destroy --auto-approve All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default diff --git a/acceptance/bundle/deploy/empty-bundle/script b/acceptance/bundle/deploy/empty-bundle/script index b74818f1b1a..775ccd0defc 100644 --- a/acceptance/bundle/deploy/empty-bundle/script +++ b/acceptance/bundle/deploy/empty-bundle/script @@ -4,4 +4,4 @@ cleanup() { } trap cleanup EXIT -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy diff --git a/acceptance/bundle/deploy/empty-bundle/test.toml b/acceptance/bundle/deploy/empty-bundle/test.toml index 84da5529dc0..f64800a1636 100644 --- a/acceptance/bundle/deploy/empty-bundle/test.toml +++ b/acceptance/bundle/deploy/empty-bundle/test.toml @@ -2,6 +2,3 @@ Cloud = true [EnvMatrix] DATABRICKS_BUNDLE_ENABLE_EXPERIMENTAL_YAML_SYNC = ["", "true"] -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' diff --git a/acceptance/bundle/scripts/out.deploy.direct.txt 
b/acceptance/bundle/scripts/out.deploy.direct.txt deleted file mode 100644 index 037f609f944..00000000000 --- a/acceptance/bundle/scripts/out.deploy.direct.txt +++ /dev/null @@ -1,24 +0,0 @@ - ->>> EXITCODE=0 errcode [CLI] bundle deploy -Executing 'preinit' script -from myscript.py 0 preinit: hello stdout! -from myscript.py 0 preinit: hello stderr! -Executing 'postinit' script -from myscript.py 0 postinit: hello stdout! -from myscript.py 0 postinit: hello stderr! -Executing 'prebuild' script -from myscript.py 0 prebuild: hello stdout! -from myscript.py 0 prebuild: hello stderr! -Executing 'postbuild' script -from myscript.py 0 postbuild: hello stdout! -from myscript.py 0 postbuild: hello stderr! -Executing 'predeploy' script -from myscript.py 0 predeploy: hello stdout! -from myscript.py 0 predeploy: hello stderr! -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! -Executing 'postdeploy' script -from myscript.py 0 postdeploy: hello stdout! -from myscript.py 0 postdeploy: hello stderr! diff --git a/acceptance/bundle/scripts/out.deploy.terraform.txt b/acceptance/bundle/scripts/out.deploy.terraform.txt deleted file mode 100644 index a3d9ba342c2..00000000000 --- a/acceptance/bundle/scripts/out.deploy.terraform.txt +++ /dev/null @@ -1,23 +0,0 @@ - ->>> EXITCODE=0 errcode [CLI] bundle deploy -Executing 'preinit' script -from myscript.py 0 preinit: hello stdout! -from myscript.py 0 preinit: hello stderr! -Executing 'postinit' script -from myscript.py 0 postinit: hello stdout! -from myscript.py 0 postinit: hello stderr! -Executing 'prebuild' script -from myscript.py 0 prebuild: hello stdout! -from myscript.py 0 prebuild: hello stderr! -Executing 'postbuild' script -from myscript.py 0 postbuild: hello stdout! -from myscript.py 0 postbuild: hello stderr! -Executing 'predeploy' script -from myscript.py 0 predeploy: hello stdout! 
-from myscript.py 0 predeploy: hello stderr! -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts/default/files... -Deploying resources... -Deployment complete! -Executing 'postdeploy' script -from myscript.py 0 postdeploy: hello stdout! -from myscript.py 0 postdeploy: hello stderr! diff --git a/acceptance/bundle/scripts/output.txt b/acceptance/bundle/scripts/output.txt index a39a0b0aa9b..68afb2feccb 100644 --- a/acceptance/bundle/scripts/output.txt +++ b/acceptance/bundle/scripts/output.txt @@ -25,3 +25,26 @@ Name: scripts Found 1 error Exit code: 1 + +>>> EXITCODE=0 errcode [CLI] bundle deploy +Executing 'preinit' script +from myscript.py 0 preinit: hello stdout! +from myscript.py 0 preinit: hello stderr! +Executing 'postinit' script +from myscript.py 0 postinit: hello stdout! +from myscript.py 0 postinit: hello stderr! +Executing 'prebuild' script +from myscript.py 0 prebuild: hello stdout! +from myscript.py 0 prebuild: hello stderr! +Executing 'postbuild' script +from myscript.py 0 postbuild: hello stdout! +from myscript.py 0 postbuild: hello stderr! +Executing 'predeploy' script +from myscript.py 0 predeploy: hello stdout! +from myscript.py 0 predeploy: hello stderr! +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts/default/files... +Deploying resources... +Deployment complete! +Executing 'postdeploy' script +from myscript.py 0 postdeploy: hello stdout! +from myscript.py 0 postdeploy: hello stderr! 
diff --git a/acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt b/acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt deleted file mode 100644 index d8fed9e4e6c..00000000000 --- a/acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt +++ /dev/null @@ -1,18 +0,0 @@ - ->>> errcode [CLI] bundle deploy -Executing 'preinit' script -preinit value_from_env -Executing 'postinit' script -postinit value_from_env -Executing 'prebuild' script -prebuild value_from_env -Executing 'postbuild' script -postbuild value_from_env -Executing 'predeploy' script -predeploy value_from_env -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts_with_restricted_execution/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! -Executing 'postdeploy' script -postdeploy value_from_env diff --git a/acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt b/acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt deleted file mode 100644 index efcf1281cb7..00000000000 --- a/acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt +++ /dev/null @@ -1,17 +0,0 @@ - ->>> errcode [CLI] bundle deploy -Executing 'preinit' script -preinit value_from_env -Executing 'postinit' script -postinit value_from_env -Executing 'prebuild' script -prebuild value_from_env -Executing 'postbuild' script -postbuild value_from_env -Executing 'predeploy' script -predeploy value_from_env -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts_with_restricted_execution/default/files... -Deploying resources... -Deployment complete! 
-Executing 'postdeploy' script -postdeploy value_from_env diff --git a/acceptance/bundle/scripts/restricted-execution/output.txt b/acceptance/bundle/scripts/restricted-execution/output.txt index 2186ac68f02..f377edba7cb 100644 --- a/acceptance/bundle/scripts/restricted-execution/output.txt +++ b/acceptance/bundle/scripts/restricted-execution/output.txt @@ -1,5 +1,22 @@ === Without DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION, all envs are accessible +>>> errcode [CLI] bundle deploy +Executing 'preinit' script +preinit value_from_env +Executing 'postinit' script +postinit value_from_env +Executing 'prebuild' script +prebuild value_from_env +Executing 'postbuild' script +postbuild value_from_env +Executing 'predeploy' script +predeploy value_from_env +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts_with_restricted_execution/default/files... +Deploying resources... +Deployment complete! +Executing 'postdeploy' script +postdeploy value_from_env + === With DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1, no envs are accessible >>> DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1 errcode [CLI] bundle deploy Error: failed to execute script: running scripts is not allowed when DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION is set diff --git a/acceptance/bundle/scripts/restricted-execution/script b/acceptance/bundle/scripts/restricted-execution/script index 2e31cce2eea..7a3dcb068b4 100644 --- a/acceptance/bundle/scripts/restricted-execution/script +++ b/acceptance/bundle/scripts/restricted-execution/script @@ -1,7 +1,7 @@ export SOME_ENV_VAR="value_from_env" title "Without DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION, all envs are accessible" -trace errcode $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace errcode $CLI bundle deploy title "With DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1, no envs are accessible" trace DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1 errcode $CLI bundle deploy diff --git 
a/acceptance/bundle/scripts/restricted-execution/test.toml b/acceptance/bundle/scripts/restricted-execution/test.toml deleted file mode 100644 index 2a2e9c20339..00000000000 --- a/acceptance/bundle/scripts/restricted-execution/test.toml +++ /dev/null @@ -1,3 +0,0 @@ -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' diff --git a/acceptance/bundle/scripts/script b/acceptance/bundle/scripts/script index 3acb85f9cd1..de07d277ea9 100644 --- a/acceptance/bundle/scripts/script +++ b/acceptance/bundle/scripts/script @@ -1,3 +1,3 @@ trace EXITCODE=0 errcode $CLI bundle validate trace EXITCODE=1 errcode $CLI bundle validate -trace EXITCODE=0 errcode $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace EXITCODE=0 errcode $CLI bundle deploy diff --git a/acceptance/bundle/scripts/test.toml b/acceptance/bundle/scripts/test.toml deleted file mode 100644 index 2a2e9c20339..00000000000 --- a/acceptance/bundle/scripts/test.toml +++ /dev/null @@ -1,3 +0,0 @@ -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt deleted file mode 100644 index 0e133547de1..00000000000 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -t one -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Updating deployment state... -Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt deleted file mode 100644 index 65960fa86d5..00000000000 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -t one -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt deleted file mode 100644 index 120e5902015..00000000000 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -t two -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt deleted file mode 100644 index fabdebb399f..00000000000 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -t two -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt index 69c6730b46a..a03920c3fdc 100644 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt @@ -1,4 +1,14 @@ +>>> [CLI] bundle deploy -t one +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Deployment complete! + +>>> [CLI] bundle deploy -t two +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... +Deploying resources... +Deployment complete! + >>> cat out.requests.txt { "workspace_artifact_path_type": "WORKSPACE_FILE_SYSTEM" diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/script b/acceptance/bundle/telemetry/deploy-artifact-path-type/script index 4f3bd7c3cf4..d1a63928a67 100644 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/script +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/script @@ -1,6 +1,6 @@ -trace $CLI bundle deploy -t one > out.deploy-one.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t one -trace $CLI bundle deploy -t two > out.deploy-two.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t two trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {workspace_artifact_path_type}' diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/test.toml b/acceptance/bundle/telemetry/deploy-artifact-path-type/test.toml index d4126948d39..32b75237a12 100644 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/test.toml +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/test.toml @@ -20,7 +20,3 @@ Response.Body = '{}' # I'm adding 405 because that's what this test originally do. 
It's somewhat # surprising though that CLI can receive 405 and that does not result in error anywhere. Response.StatusCode = 405 - -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt b/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt deleted file mode 100644 index 1b73d1b9169..00000000000 --- a/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt b/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt deleted file mode 100644 index 5c6aad5b37b..00000000000 --- a/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/output.txt b/acceptance/bundle/telemetry/deploy-config-file-count/output.txt index 1637965310c..909e8d6c705 100644 --- a/acceptance/bundle/telemetry/deploy-config-file-count/output.txt +++ b/acceptance/bundle/telemetry/deploy-config-file-count/output.txt @@ -1,4 +1,9 @@ +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! 
+ >>> cat out.requests.txt { "configuration_file_count": 4 diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/script b/acceptance/bundle/telemetry/deploy-config-file-count/script index 7fbdd0e6776..c495bdcb071 100644 --- a/acceptance/bundle/telemetry/deploy-config-file-count/script +++ b/acceptance/bundle/telemetry/deploy-config-file-count/script @@ -1,4 +1,4 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {configuration_file_count}' diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/test.toml b/acceptance/bundle/telemetry/deploy-config-file-count/test.toml deleted file mode 100644 index 2a2e9c20339..00000000000 --- a/acceptance/bundle/telemetry/deploy-config-file-count/test.toml +++ /dev/null @@ -1,3 +0,0 @@ -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt deleted file mode 100644 index e86795abf5d..00000000000 --- a/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -t dev -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/dev/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt deleted file mode 100644 index ee47fabbb63..00000000000 --- a/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -t dev -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/dev/files... 
-Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt deleted file mode 100644 index 5957e33b910..00000000000 --- a/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt +++ /dev/null @@ -1,12 +0,0 @@ - ->>> [CLI] bundle deploy -t prod -Recommendation: target with 'mode: production' should set 'workspace.root_path' to make sure only one copy is deployed - -A common practice is to use a username or principal name in this path, i.e. use - - root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} - -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/prod/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt deleted file mode 100644 index ac2e13efb95..00000000000 --- a/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt +++ /dev/null @@ -1,11 +0,0 @@ - ->>> [CLI] bundle deploy -t prod -Recommendation: target with 'mode: production' should set 'workspace.root_path' to make sure only one copy is deployed - -A common practice is to use a username or principal name in this path, i.e. use - - root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} - -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/prod/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-mode/output.txt b/acceptance/bundle/telemetry/deploy-mode/output.txt index 89be65f1950..99e7fbb699a 100644 --- a/acceptance/bundle/telemetry/deploy-mode/output.txt +++ b/acceptance/bundle/telemetry/deploy-mode/output.txt @@ -1,4 +1,20 @@ +>>> [CLI] bundle deploy -t dev +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/dev/files... +Deploying resources... +Deployment complete! + +>>> [CLI] bundle deploy -t prod +Recommendation: target with 'mode: production' should set 'workspace.root_path' to make sure only one copy is deployed + +A common practice is to use a username or principal name in this path, i.e. use + + root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/prod/files... +Deploying resources... +Deployment complete! + >>> cat out.requests.txt { "bundle_mode": "DEVELOPMENT" diff --git a/acceptance/bundle/telemetry/deploy-mode/script b/acceptance/bundle/telemetry/deploy-mode/script index 0a9d57a1a43..f7257769ac1 100644 --- a/acceptance/bundle/telemetry/deploy-mode/script +++ b/acceptance/bundle/telemetry/deploy-mode/script @@ -1,6 +1,6 @@ -trace $CLI bundle deploy -t dev > out.deploy-dev.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t dev -trace $CLI bundle deploy -t prod > out.deploy-prod.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t prod trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {bundle_mode}' diff --git a/acceptance/bundle/telemetry/deploy-mode/test.toml b/acceptance/bundle/telemetry/deploy-mode/test.toml deleted file mode 100644 index 2a2e9c20339..00000000000 --- a/acceptance/bundle/telemetry/deploy-mode/test.toml +++ /dev/null @@ -1,3 +0,0 @@ -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' diff --git 
a/acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt b/acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt deleted file mode 100644 index 0e133547de1..00000000000 --- a/acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -t one -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt b/acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt deleted file mode 100644 index 65960fa86d5..00000000000 --- a/acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -t one -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-target-count/output.txt b/acceptance/bundle/telemetry/deploy-target-count/output.txt index 9c59c430234..31581169f2c 100644 --- a/acceptance/bundle/telemetry/deploy-target-count/output.txt +++ b/acceptance/bundle/telemetry/deploy-target-count/output.txt @@ -1,4 +1,9 @@ +>>> [CLI] bundle deploy -t one +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Deployment complete! 
+ >>> cat out.requests.txt { "target_count": 3 diff --git a/acceptance/bundle/telemetry/deploy-target-count/script b/acceptance/bundle/telemetry/deploy-target-count/script index 6e9d2f7378c..3022a2b5e49 100644 --- a/acceptance/bundle/telemetry/deploy-target-count/script +++ b/acceptance/bundle/telemetry/deploy-target-count/script @@ -1,4 +1,4 @@ -trace $CLI bundle deploy -t one > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t one trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {target_count}' diff --git a/acceptance/bundle/telemetry/deploy-target-count/test.toml b/acceptance/bundle/telemetry/deploy-target-count/test.toml deleted file mode 100644 index 2a2e9c20339..00000000000 --- a/acceptance/bundle/telemetry/deploy-target-count/test.toml +++ /dev/null @@ -1,3 +0,0 @@ -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' diff --git a/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt b/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt deleted file mode 100644 index 1b73d1b9169..00000000000 --- a/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt b/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt deleted file mode 100644 index 5c6aad5b37b..00000000000 --- a/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... 
-Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-variable-count/output.txt b/acceptance/bundle/telemetry/deploy-variable-count/output.txt index e8580d71b39..be4840e69ef 100644 --- a/acceptance/bundle/telemetry/deploy-variable-count/output.txt +++ b/acceptance/bundle/telemetry/deploy-variable-count/output.txt @@ -1,4 +1,9 @@ +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! + >>> cat out.requests.txt { "variable_count": 6, diff --git a/acceptance/bundle/telemetry/deploy-variable-count/script b/acceptance/bundle/telemetry/deploy-variable-count/script index caaf8c1f39f..dad762899a2 100644 --- a/acceptance/bundle/telemetry/deploy-variable-count/script +++ b/acceptance/bundle/telemetry/deploy-variable-count/script @@ -1,4 +1,4 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs.[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {variable_count, lookup_variable_count, complex_variable_count}' diff --git a/acceptance/bundle/telemetry/deploy-variable-count/test.toml b/acceptance/bundle/telemetry/deploy-variable-count/test.toml index 0a40c794b3a..855ecdd39ee 100644 --- a/acceptance/bundle/telemetry/deploy-variable-count/test.toml +++ b/acceptance/bundle/telemetry/deploy-variable-count/test.toml @@ -14,7 +14,3 @@ Response.Body = ''' ] } ''' - -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt deleted file mode 100644 index f8db617c003..00000000000 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt +++ /dev/null @@ -1,8 +0,0 @@ - ->>> [CLI] bundle deploy -t one 
-Building test... -Uploading my_test_code/dist/my_test_code-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt deleted file mode 100644 index 048d0f07b50..00000000000 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -t one -Building test... -Uploading my_test_code/dist/my_test_code-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt deleted file mode 100644 index b786de11fed..00000000000 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt +++ /dev/null @@ -1,8 +0,0 @@ - ->>> [CLI] bundle deploy -t two -Building test... -Uploading .databricks/bundle/two/patched_wheels/test_my_test_code/my_test_code-0.0.1+[UNIX_TIME_NANOS]-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt deleted file mode 100644 index 651d315f77c..00000000000 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -t two -Building test... 
-Uploading .databricks/bundle/two/patched_wheels/test_my_test_code/my_test_code-0.0.1+[UNIX_TIME_NANOS]-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt index ed89628d989..a9b8ce4ae6e 100644 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt @@ -1,4 +1,18 @@ +>>> [CLI] bundle deploy -t one +Building test... +Uploading my_test_code/dist/my_test_code-0.0.1-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Deployment complete! + +>>> [CLI] bundle deploy -t two +Building test... +Uploading .databricks/bundle/two/patched_wheels/test_my_test_code/my_test_code-0.0.1+[UNIX_TIME_NANOS]-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... +Deploying resources... +Deployment complete! 
+ >>> cat out.requests.txt { "bool_values": [ diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/script b/acceptance/bundle/telemetry/deploy-whl-artifacts/script index 5bc513afb87..078fa94cdd3 100644 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/script +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/script @@ -2,9 +2,9 @@ uv venv -q .venv venv_activate uv pip install -q --no-index setuptools -trace $CLI bundle deploy -t one > out.deploy-one.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t one -trace $CLI bundle deploy -t two > out.deploy-two.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t two trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {bool_values}' diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/test.toml b/acceptance/bundle/telemetry/deploy-whl-artifacts/test.toml index 317e12a834d..0d481507067 100644 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/test.toml +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/test.toml @@ -6,7 +6,3 @@ Ignore = [ '.databricks', "__pycache__", ] - -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' diff --git a/acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt b/acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt deleted file mode 100644 index 1b73d1b9169..00000000000 --- a/acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! 
diff --git a/acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt b/acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt deleted file mode 100644 index 5c6aad5b37b..00000000000 --- a/acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/validate/sync_patterns/output.txt b/acceptance/bundle/validate/sync_patterns/output.txt index 0c061fbe312..b35859d86a9 100644 --- a/acceptance/bundle/validate/sync_patterns/output.txt +++ b/acceptance/bundle/validate/sync_patterns/output.txt @@ -20,3 +20,8 @@ Validation OK! "." ] } + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/validate/sync_patterns/script b/acceptance/bundle/validate/sync_patterns/script index 485556d28a6..d2aae85444a 100644 --- a/acceptance/bundle/validate/sync_patterns/script +++ b/acceptance/bundle/validate/sync_patterns/script @@ -1,5 +1,5 @@ trace $CLI bundle validate trace $CLI bundle validate -o json | jq '.sync' -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy jq 'select(.path | test("dir/test.yml"))' out.requests.txt > out.sync.txt rm out.requests.txt diff --git a/acceptance/bundle/validate/sync_patterns/test.toml b/acceptance/bundle/validate/sync_patterns/test.toml index abc1014fd61..159efe02696 100644 --- a/acceptance/bundle/validate/sync_patterns/test.toml +++ b/acceptance/bundle/validate/sync_patterns/test.toml @@ -1,5 +1 @@ RecordRequests = true - -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' diff --git a/acceptance/cache/simple/out.deploy.direct.txt b/acceptance/cache/simple/out.deploy.direct.txt deleted file mode 100644 index 
945da6d1443..00000000000 --- a/acceptance/cache/simple/out.deploy.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -p dogfood -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/exploratory-cache-test/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/cache/simple/out.deploy.terraform.txt b/acceptance/cache/simple/out.deploy.terraform.txt deleted file mode 100644 index 41cfbc2a2d3..00000000000 --- a/acceptance/cache/simple/out.deploy.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -p dogfood -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/exploratory-cache-test/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/cache/simple/script b/acceptance/cache/simple/script index 524c077f460..a2907174bf3 100644 --- a/acceptance/cache/simple/script +++ b/acceptance/cache/simple/script @@ -9,7 +9,7 @@ title "Second call in a session is expected to be a cache hit\n" trace $CLI bundle validate -p dogfood --debug 2>&1 | grep "Local Cache" | grep -v "cache path" title "Bundle deploy should send telemetry values\n" -trace $CLI bundle deploy -p dogfood > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -p dogfood trace print_telemetry_bool_values | grep "local.cache" rm out.requests.txt diff --git a/acceptance/cache/simple/test.toml b/acceptance/cache/simple/test.toml index 2601c79f825..08cabc87be6 100644 --- a/acceptance/cache/simple/test.toml +++ b/acceptance/cache/simple/test.toml @@ -3,9 +3,6 @@ Local = true RecordRequests = true -# Enable engine-specific output files -EnvVaryOutput = "DATABRICKS_BUNDLE_ENGINE" - # Redact structured logging fields from debug output [[Repls]] Old = ' pid=[0-9]+' @@ -14,7 +11,3 @@ New = '' [[Repls]] Old = ' mutator=[A-Za-z]+' New = '' - -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' From 8c38c727c77f88f22691ca0a1c1f3fa6599f6c22 Mon Sep 17 
00:00:00 2001 From: Denis Bilenko Date: Thu, 30 Apr 2026 12:16:13 +0200 Subject: [PATCH 24/80] fmt Co-authored-by: Denis Bilenko --- cmd/bundle/utils/process.go | 1 - 1 file changed, 1 deletion(-) diff --git a/cmd/bundle/utils/process.go b/cmd/bundle/utils/process.go index 9948b77a342..1e522343427 100644 --- a/cmd/bundle/utils/process.go +++ b/cmd/bundle/utils/process.go @@ -80,7 +80,6 @@ type ProcessOptions struct { // The plan is computed after PreDeployChecks while state is still open for read. ComputePlan bool - // Indicate whether the bundle operation originates from the pipelines CLI IsPipelinesCLI bool } From b0bad1a3a945c95d23c5f4589a06ec7ce5eac4a3 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Thu, 30 Apr 2026 12:20:17 +0200 Subject: [PATCH 25/80] Maintain stateIDs as single source of truth for resource IDs Populate stateIDs from State on Reload so it always mirrors the effective view: initialized from disk, updated by SaveState/DeleteState. GetResourceID now consults stateIDs unconditionally instead of checking walFile and falling back to State. 
Co-authored-by: Denis Bilenko --- bundle/direct/dstate/state.go | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index b0110d519df..9b43f50610a 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -132,23 +132,7 @@ func (db *DeploymentState) GetResourceID(key string) string { db.mu.Lock() defer db.mu.Unlock() - if db.walFile != nil { - // in write-mode new IDs are written to WAL and stored in this map - id := db.stateIDs[key] - if id != "" { - return id - } - } - - // in read mode State is the source of IDs for all requests - // in write mode State is the source of IDs for all resources that were not updated - - if db.Data.State == nil { - return "" - } - - entry := db.Data.State[key] - return entry.ID + return db.stateIDs[key] } type ( @@ -220,7 +204,12 @@ func (db *DeploymentState) Reload(ctx context.Context) error { return err } } else { - return json.Unmarshal(data, &db.Data) + if err := json.Unmarshal(data, &db.Data); err != nil { + return err + } + } + for key, entry := range db.Data.State { + db.stateIDs[key] = entry.ID } return nil } From 44f6141a7a7a187ad400ba4bc8f3b958d74d13b9 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Thu, 30 Apr 2026 12:35:53 +0200 Subject: [PATCH 26/80] Remove defer Close from processBundleRetInternal; align with main approach State opened for read in ProcessBundleRet stays open after return. Deploy and Destroy call UpgradeToWrite + Close internally, so no defensive defer is needed. plan.go reverts to the two-step pattern from main: ProcessBundleRet then phases.RunPlan. ProcessBundleRetWithPlan and opts.ComputePlan are removed. 
Co-authored-by: Denis Bilenko --- cmd/bundle/plan.go | 6 ++++-- cmd/bundle/utils/process.go | 27 ++------------------------- 2 files changed, 6 insertions(+), 27 deletions(-) diff --git a/cmd/bundle/plan.go b/cmd/bundle/plan.go index d14f820f4e0..e3dd63929ed 100644 --- a/cmd/bundle/plan.go +++ b/cmd/bundle/plan.go @@ -7,6 +7,7 @@ import ( "github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/deployplan" + "github.com/databricks/cli/bundle/phases" "github.com/databricks/cli/cmd/bundle/utils" "github.com/databricks/cli/cmd/root" "github.com/databricks/cli/libs/flags" @@ -55,13 +56,14 @@ It is useful for previewing changes before running 'bundle deploy'.`, } } - _, _, plan, err := utils.ProcessBundleRetWithPlan(cmd, opts) + b, stateDesc, err := utils.ProcessBundleRet(cmd, opts) if err != nil { return err } ctx := cmd.Context() - if plan == nil || logdiag.HasError(ctx) { + plan := phases.RunPlan(ctx, b, stateDesc.Engine) + if logdiag.HasError(ctx) { return root.ErrAlreadyPrinted } diff --git a/cmd/bundle/utils/process.go b/cmd/bundle/utils/process.go index 1e522343427..fbc662e0fee 100644 --- a/cmd/bundle/utils/process.go +++ b/cmd/bundle/utils/process.go @@ -72,14 +72,9 @@ type ProcessOptions struct { // When set, skips Build and PreDeployChecks phases, loads plan from file instead of calculating. ReadPlanPath string - // PostStateFunc is called at the end of ProcessBundleRet, within the state lifecycle scope - // (after state is opened and IDs loaded, before deferred Finalize). + // PostStateFunc is called at the end of ProcessBundleRet, while state is still open. PostStateFunc func(ctx context.Context, b *bundle.Bundle, stateDesc *statemgmt.StateDesc) error - // If true, compute the deployment plan and return it via ProcessBundleRetWithPlan. - // The plan is computed after PreDeployChecks while state is still open for read. 
- ComputePlan bool - // Indicate whether the bundle operation originates from the pipelines CLI IsPipelinesCLI bool } @@ -94,12 +89,6 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (*bundle.Bundle, return b, stateDesc, err } -// ProcessBundleRetWithPlan is like ProcessBundleRet but also computes and returns a deployment plan. -// opts.ComputePlan must be true. -func ProcessBundleRetWithPlan(cmd *cobra.Command, opts ProcessOptions) (*bundle.Bundle, *statemgmt.StateDesc, *deployplan.Plan, error) { - opts.ComputePlan = true - return processBundleRetInternal(cmd, opts) -} func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle, stateDesc *statemgmt.StateDesc, plan *deployplan.Plan, retErr error) { var err error @@ -201,16 +190,12 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl cmd.SetContext(ctx) // Open state for read (with WAL recovery) so that ExportState, CalculatePlan, etc. can access it. - // Caller is responsible for closing state when done (Deploy closes read + reopens for write). + // Caller is responsible for closing state when done (Deploy/Destroy upgrade to write and close). if stateDesc.Engine.IsDirect() { _, localPath := b.StateFilenameDirect(ctx) if err := b.DeploymentBundle.StateDB.Open(ctx, localPath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { return b, stateDesc, nil, err } - defer func() { - // Close is idempotent — no-op if already closed by Deploy - b.DeploymentBundle.StateDB.Close(ctx) - }() } // These are not safe in plan/deploy because they insert empty config settings for deleted resources. 
@@ -316,14 +301,6 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl } } - // Compute plan while state is open for read (before Deploy upgrades to write) - if opts.ComputePlan && plan == nil { - plan = phases.RunPlan(ctx, b, stateDesc.Engine) - if logdiag.HasError(ctx) { - return b, stateDesc, nil, root.ErrAlreadyPrinted - } - } - if opts.Deploy { var outputHandler sync.OutputHandler if opts.Verbose { From 13d2ae9b08ce4d3051a50d4268f299365d9695f6 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Thu, 30 Apr 2026 12:44:05 +0200 Subject: [PATCH 27/80] Rename Close to Finalize; make plan a local var in processBundleRetInternal Close(ctx) -> Finalize(ctx) to match main's naming. plan was a named return in processBundleRetInternal only to support the now-removed ProcessBundleRetWithPlan; demote it to a local variable. Co-authored-by: Denis Bilenko --- bundle/configsync/diff.go | 2 +- bundle/direct/bind.go | 12 ++--- bundle/direct/dstate/state.go | 6 +-- bundle/direct/dstate/state_test.go | 14 ++--- bundle/phases/deploy.go | 6 +-- bundle/phases/destroy.go | 2 +- .../statemgmt/upload_state_for_yaml_sync.go | 6 +-- cmd/bundle/deployment/migrate.go | 4 +- cmd/bundle/generate/dashboard.go | 2 +- cmd/bundle/utils/process.go | 52 +++++++++---------- 10 files changed, 53 insertions(+), 53 deletions(-) diff --git a/bundle/configsync/diff.go b/bundle/configsync/diff.go index 1770d945490..5b2d5cfd156 100644 --- a/bundle/configsync/diff.go +++ b/bundle/configsync/diff.go @@ -138,7 +138,7 @@ func DetectChanges(ctx context.Context, b *bundle.Bundle, engine engine.EngineTy if err := deployBundle.StateDB.Open(ctx, statePath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { return nil, fmt.Errorf("failed to open state: %w", err) } - defer deployBundle.StateDB.Close(ctx) + defer deployBundle.StateDB.Finalize(ctx) } plan, err := deployBundle.CalculatePlan(ctx, b.WorkspaceClient(ctx), &b.Config) diff --git a/bundle/direct/bind.go 
b/bundle/direct/bind.go index fe8ced6d225..693d613bed9 100644 --- a/bundle/direct/bind.go +++ b/bundle/direct/bind.go @@ -64,7 +64,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac var checkStateDB dstate.DeploymentState if err := checkStateDB.Open(ctx, statePath, dstate.WithRecovery(true), dstate.WithWrite(false)); err == nil { existingID := checkStateDB.GetResourceID(resourceKey) - checkStateDB.Close(ctx) + checkStateDB.Finalize(ctx) if existingID != "" { return nil, ErrResourceAlreadyBound{ ResourceKey: resourceKey, @@ -98,7 +98,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac } // Finalize to persist temp state to disk - err = b.StateDB.Close(ctx) + err = b.StateDB.Finalize(ctx) if err != nil { os.Remove(tmpStatePath) return nil, err @@ -117,7 +117,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac os.Remove(tmpStatePath) return nil, err } - b.StateDB.Close(ctx) + b.StateDB.Finalize(ctx) // Populate the state with the resolved config entry := plan.Plan[resourceKey] @@ -152,7 +152,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac return nil, err } - err = b.StateDB.Close(ctx) + err = b.StateDB.Finalize(ctx) if err != nil { os.Remove(tmpStatePath) return nil, err @@ -166,7 +166,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac return nil, err } plan, err = b.CalculatePlan(ctx, client, configRoot) - b.StateDB.Close(ctx) + b.StateDB.Finalize(ctx) if err != nil { os.Remove(tmpStatePath) return nil, err @@ -236,5 +236,5 @@ func (b *DeploymentBundle) Unbind(ctx context.Context, statePath, resourceKey st } } - return b.StateDB.Close(ctx) + return b.StateDB.Finalize(ctx) } diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 9b43f50610a..90f8ca07fc8 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -298,9 +298,9 @@ func (db 
*DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) return lineNumber > 1, scanner.Err() } -// Close replays the WAL (if open for write) and resets the state. -// Safe to call multiple times or on an already-closed state. -func (db *DeploymentState) Close(ctx context.Context) error { +// Finalize replays the WAL (if open for write) and resets the state. +// Safe to call multiple times or on an already-finalized state. +func (db *DeploymentState) Finalize(ctx context.Context) error { db.mu.Lock() defer db.mu.Unlock() diff --git a/bundle/direct/dstate/state_test.go b/bundle/direct/dstate/state_test.go index 8e817dd1988..99efda82b4f 100644 --- a/bundle/direct/dstate/state_test.go +++ b/bundle/direct/dstate/state_test.go @@ -16,14 +16,14 @@ func TestOpenCloseRoundTrip(t *testing.T) { require.NoError(t, db.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) require.NoError(t, db.SaveState("jobs.my_job", "123", map[string]string{"key": "val"}, nil)) - require.NoError(t, db.Close(t.Context())) + require.NoError(t, db.Finalize(t.Context())) // Re-open and verify persisted data. 
var db2 DeploymentState require.NoError(t, db2.Open(t.Context(), path, WithRecovery(false), WithWrite(false))) assert.Equal(t, 1, db2.Data.Serial) assert.Equal(t, "123", db2.GetResourceID("jobs.my_job")) - require.NoError(t, db2.Close(t.Context())) + require.NoError(t, db2.Finalize(t.Context())) } func TestCloseWithNoEntriesDoesNotWriteStateFile(t *testing.T) { @@ -31,7 +31,7 @@ func TestCloseWithNoEntriesDoesNotWriteStateFile(t *testing.T) { var db DeploymentState require.NoError(t, db.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) - require.NoError(t, db.Close(t.Context())) + require.NoError(t, db.Finalize(t.Context())) _, err := os.Stat(path) assert.ErrorIs(t, err, os.ErrNotExist) @@ -46,7 +46,7 @@ func TestPanicOnDoubleOpen(t *testing.T) { assert.Panics(t, func() { _ = db.Open(t.Context(), path, WithRecovery(true), WithWrite(true)) }) - db.Close(t.Context()) + db.Finalize(t.Context()) } func TestDeleteState(t *testing.T) { @@ -55,16 +55,16 @@ func TestDeleteState(t *testing.T) { var db DeploymentState require.NoError(t, db.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) require.NoError(t, db.SaveState("jobs.my_job", "123", map[string]string{}, nil)) - require.NoError(t, db.Close(t.Context())) + require.NoError(t, db.Finalize(t.Context())) var db2 DeploymentState require.NoError(t, db2.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) require.NoError(t, db2.DeleteState("jobs.my_job")) - require.NoError(t, db2.Close(t.Context())) + require.NoError(t, db2.Finalize(t.Context())) var db3 DeploymentState require.NoError(t, db3.Open(t.Context(), path, WithRecovery(false), WithWrite(false))) assert.Equal(t, 2, db3.Data.Serial) assert.Equal(t, "", db3.GetResourceID("jobs.my_job")) - require.NoError(t, db3.Close(t.Context())) + require.NoError(t, db3.Finalize(t.Context())) } diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index 70a81d74607..a11fe4bc12b 100644 --- a/bundle/phases/deploy.go +++ 
b/bundle/phases/deploy.go @@ -78,7 +78,7 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta // Close state to replay WAL into state file, then reopen for read. // PushResourcesState needs the file on disk, Load needs the state in memory. if targetEngine.IsDirect() { - if err := b.DeploymentBundle.StateDB.Close(ctx); err != nil { + if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { logdiag.LogError(ctx, err) } _, localPath := b.StateFilenameDirect(ctx) @@ -163,7 +163,7 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand return } defer func() { - if err := b.DeploymentBundle.StateDB.Close(ctx); err != nil { + if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { logdiag.LogError(ctx, err) } }() @@ -184,7 +184,7 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand return } defer func() { - if err := b.DeploymentBundle.StateDB.Close(ctx); err != nil { + if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { logdiag.LogError(ctx, err) } }() diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index b9ff5873bf8..3721f6a8835 100644 --- a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -169,7 +169,7 @@ func Destroy(ctx context.Context, b *bundle.Bundle, engine engine.EngineType) { return } defer func() { - if err := b.DeploymentBundle.StateDB.Close(ctx); err != nil { + if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { logdiag.LogError(ctx, err) } }() diff --git a/bundle/statemgmt/upload_state_for_yaml_sync.go b/bundle/statemgmt/upload_state_for_yaml_sync.go index 75314c14223..645069e2814 100644 --- a/bundle/statemgmt/upload_state_for_yaml_sync.go +++ b/bundle/statemgmt/upload_state_for_yaml_sync.go @@ -183,7 +183,7 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun plan, err := deploymentBundle.CalculatePlan(ctx, b.WorkspaceClient(ctx), &uninterpolatedConfig) if err != nil { 
- deploymentBundle.StateDB.Close(ctx) + deploymentBundle.StateDB.Finalize(ctx) return false, err } @@ -207,7 +207,7 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun } // Close read state and reopen for write so Apply can record state changes via WAL. - if err := deploymentBundle.StateDB.Close(ctx); err != nil { + if err := deploymentBundle.StateDB.Finalize(ctx); err != nil { return false, fmt.Errorf("closing state after plan: %w", err) } if err := deploymentBundle.StateDB.Open(ctx, snapshotPath, dstate.WithRecovery(false), dstate.WithWrite(true)); err != nil { @@ -215,7 +215,7 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun } deploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(true)) - if err := deploymentBundle.StateDB.Close(ctx); err != nil { + if err := deploymentBundle.StateDB.Finalize(ctx); err != nil { return false, err } diff --git a/cmd/bundle/deployment/migrate.go b/cmd/bundle/deployment/migrate.go index fb2dbf6ad56..fddfa55d7d4 100644 --- a/cmd/bundle/deployment/migrate.go +++ b/cmd/bundle/deployment/migrate.go @@ -293,14 +293,14 @@ To start using direct engine, set "engine: direct" under bundle in your databric } } - deploymentBundle.StateDB.Close(ctx) + deploymentBundle.StateDB.Finalize(ctx) err = deploymentBundle.StateDB.Open(ctx, tempStatePath, dstate.WithRecovery(false), dstate.WithWrite(true)) if err != nil { return fmt.Errorf("reopening state for apply: %w", err) } deploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(true)) - if err := deploymentBundle.StateDB.Close(ctx); err != nil { + if err := deploymentBundle.StateDB.Finalize(ctx); err != nil { logdiag.LogError(ctx, err) } if logdiag.HasError(ctx) { diff --git a/cmd/bundle/generate/dashboard.go b/cmd/bundle/generate/dashboard.go index 609b48f9817..500f67351e4 100644 --- a/cmd/bundle/generate/dashboard.go +++ b/cmd/bundle/generate/dashboard.go @@ -396,7 +396,7 @@ func (d 
*dashboard) runForResource(ctx context.Context, b *bundle.Bundle) { logdiag.LogError(ctx, err) return } - defer b.DeploymentBundle.StateDB.Close(ctx) + defer b.DeploymentBundle.StateDB.Finalize(ctx) } bundle.ApplySeqContext(ctx, b, diff --git a/cmd/bundle/utils/process.go b/cmd/bundle/utils/process.go index fbc662e0fee..ad38e28d66d 100644 --- a/cmd/bundle/utils/process.go +++ b/cmd/bundle/utils/process.go @@ -80,18 +80,18 @@ type ProcessOptions struct { } func ProcessBundle(cmd *cobra.Command, opts ProcessOptions) (*bundle.Bundle, error) { - b, _, _, err := processBundleRetInternal(cmd, opts) + b, _, err := processBundleRetInternal(cmd, opts) return b, err } func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (*bundle.Bundle, *statemgmt.StateDesc, error) { - b, stateDesc, _, err := processBundleRetInternal(cmd, opts) + b, stateDesc, err := processBundleRetInternal(cmd, opts) return b, stateDesc, err } - -func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle, stateDesc *statemgmt.StateDesc, plan *deployplan.Plan, retErr error) { +func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle, stateDesc *statemgmt.StateDesc, retErr error) { var err error + var plan *deployplan.Plan ctx := cmd.Context() if opts.SkipInitContext { if !logdiag.IsSetup(ctx) { @@ -122,20 +122,20 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl } if logdiag.HasError(ctx) { - return b, nil, nil, root.ErrAlreadyPrinted + return b, nil, root.ErrAlreadyPrinted } variables, err := cmd.Flags().GetStringSlice("var") if err != nil { logdiag.LogDiag(ctx, diag.FromErr(err)[0]) - return b, nil, nil, err + return b, nil, err } // Initialize variables by assigning them values passed as command line flags configureVariables(cmd, b, variables) if b == nil || logdiag.HasError(ctx) { - return b, nil, nil, root.ErrAlreadyPrinted + return b, nil, root.ErrAlreadyPrinted } ctx = cmd.Context() @@ -158,19 
+158,19 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl if opts.IncludeLocations { bundle.ApplyContext(ctx, b, mutator.PopulateLocations()) if logdiag.HasError(ctx) { - return b, nil, nil, root.ErrAlreadyPrinted + return b, nil, root.ErrAlreadyPrinted } } } if logdiag.HasError(ctx) { - return b, nil, nil, root.ErrAlreadyPrinted + return b, nil, root.ErrAlreadyPrinted } if opts.PostInitFunc != nil { err := opts.PostInitFunc(ctx, b) if err != nil { - return b, nil, nil, err + return b, nil, err } } @@ -179,13 +179,13 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl if shouldReadState { requiredEngine, err := ResolveEngineSetting(ctx, b) if err != nil { - return b, nil, nil, err + return b, nil, err } // PullResourcesState depends on stateFiler which needs b.Config.Workspace.StatePath which is set in phases.Initialize ctx, stateDesc = statemgmt.PullResourcesState(ctx, b, statemgmt.AlwaysPull(opts.AlwaysPull), requiredEngine) if logdiag.HasError(ctx) { - return b, stateDesc, nil, root.ErrAlreadyPrinted + return b, stateDesc, root.ErrAlreadyPrinted } cmd.SetContext(ctx) @@ -194,7 +194,7 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl if stateDesc.Engine.IsDirect() { _, localPath := b.StateFilenameDirect(ctx) if err := b.DeploymentBundle.StateDB.Open(ctx, localPath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { - return b, stateDesc, nil, err + return b, stateDesc, err } } @@ -213,7 +213,7 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl } bundle.ApplySeqContext(ctx, b, mutators...) 
if logdiag.HasError(ctx) { - return b, stateDesc, nil, root.ErrAlreadyPrinted + return b, stateDesc, root.ErrAlreadyPrinted } } } @@ -221,7 +221,7 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl if opts.ReadPlanPath != "" { if !stateDesc.Engine.IsDirect() { logdiag.LogError(ctx, errors.New("--plan is only supported with direct engine (set bundle.engine to \"direct\" or DATABRICKS_BUNDLE_ENGINE=direct)")) - return b, stateDesc, nil, root.ErrAlreadyPrinted + return b, stateDesc, root.ErrAlreadyPrinted } opts.Build = false opts.PreDeployChecks = false @@ -230,7 +230,7 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl plan, err = deployplan.LoadPlanFromFile(opts.ReadPlanPath) if err != nil { logdiag.LogError(ctx, err) - return b, stateDesc, nil, root.ErrAlreadyPrinted + return b, stateDesc, root.ErrAlreadyPrinted } currentVersion := build.GetInfo().Version if plan.CLIVersion != currentVersion { @@ -242,7 +242,7 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl err = direct.ValidatePlanAgainstState(&b.DeploymentBundle.StateDB, plan) if err != nil { logdiag.LogError(ctx, err) - return b, stateDesc, nil, root.ErrAlreadyPrinted + return b, stateDesc, root.ErrAlreadyPrinted } } else if opts.Deploy { opts.Build = true @@ -258,14 +258,14 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl }) if logdiag.HasError(ctx) { - return b, stateDesc, nil, root.ErrAlreadyPrinted + return b, stateDesc, root.ErrAlreadyPrinted } // Pipeline CLI only validation. 
if opts.IsPipelinesCLI { rejectDefinitions(ctx, b) if logdiag.HasError(ctx) { - return b, stateDesc, nil, root.ErrAlreadyPrinted + return b, stateDesc, root.ErrAlreadyPrinted } } } @@ -273,7 +273,7 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl if opts.Validate { validate.Validate(ctx, b) if logdiag.HasError(ctx) { - return b, stateDesc, nil, root.ErrAlreadyPrinted + return b, stateDesc, root.ErrAlreadyPrinted } } @@ -288,7 +288,7 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl }) if logdiag.HasError(ctx) { - return b, stateDesc, nil, root.ErrAlreadyPrinted + return b, stateDesc, root.ErrAlreadyPrinted } } @@ -297,7 +297,7 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl phases.PreDeployChecks(ctx, b, downgradeWarningToError, stateDesc.Engine) if logdiag.HasError(ctx) { - return b, stateDesc, nil, root.ErrAlreadyPrinted + return b, stateDesc, root.ErrAlreadyPrinted } } @@ -317,25 +317,25 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl }) if logdiag.HasError(ctx) { - return b, stateDesc, nil, root.ErrAlreadyPrinted + return b, stateDesc, root.ErrAlreadyPrinted } if b != nil && stateDesc != nil && stateDesc.Engine.IsDirect() && stateDesc.HasRemoteTerraformState() { statemgmt.BackupRemoteTerraformState(ctx, b) if logdiag.HasError(ctx) { - return b, stateDesc, nil, root.ErrAlreadyPrinted + return b, stateDesc, root.ErrAlreadyPrinted } } } if opts.PostStateFunc != nil { if err := opts.PostStateFunc(ctx, b, stateDesc); err != nil { - return b, stateDesc, nil, err + return b, stateDesc, err } } - return b, stateDesc, plan, nil + return b, stateDesc, nil } // ResolveEngineSetting determines the effective engine setting by combining bundle config and env var. 
From 34403eaf967acfbb101568e877f307b3faac4508 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Thu, 30 Apr 2026 12:50:10 +0200 Subject: [PATCH 28/80] Restore process.go structure to match main more closely - Collapse processBundleRetInternal back into ProcessBundleRet (named returns) - ProcessBundle calls ProcessBundleRet like on main - Restore needDirectState guard so state is only opened when needed - Move var plan back after the state block Co-authored-by: Denis Bilenko --- cmd/bundle/utils/process.go | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/cmd/bundle/utils/process.go b/cmd/bundle/utils/process.go index ad38e28d66d..f7663057c2d 100644 --- a/cmd/bundle/utils/process.go +++ b/cmd/bundle/utils/process.go @@ -80,18 +80,12 @@ type ProcessOptions struct { } func ProcessBundle(cmd *cobra.Command, opts ProcessOptions) (*bundle.Bundle, error) { - b, _, err := processBundleRetInternal(cmd, opts) + b, _, err := ProcessBundleRet(cmd, opts) return b, err } -func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (*bundle.Bundle, *statemgmt.StateDesc, error) { - b, stateDesc, err := processBundleRetInternal(cmd, opts) - return b, stateDesc, err -} - -func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle, stateDesc *statemgmt.StateDesc, retErr error) { +func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle, stateDesc *statemgmt.StateDesc, retErr error) { var err error - var plan *deployplan.Plan ctx := cmd.Context() if opts.SkipInitContext { if !logdiag.IsSetup(ctx) { @@ -189,12 +183,13 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl } cmd.SetContext(ctx) - // Open state for read (with WAL recovery) so that ExportState, CalculatePlan, etc. can access it. - // Caller is responsible for closing state when done (Deploy/Destroy upgrade to write and close). 
- if stateDesc.Engine.IsDirect() { + // Open direct engine state once for all subsequent operations (ExportState, CalculatePlan, Apply, etc.) + needDirectState := stateDesc.Engine.IsDirect() && (opts.InitIDs || opts.ErrorOnEmptyState || opts.Deploy || opts.ReadPlanPath != "" || opts.PreDeployChecks || opts.PostStateFunc != nil) + if needDirectState { _, localPath := b.StateFilenameDirect(ctx) if err := b.DeploymentBundle.StateDB.Open(ctx, localPath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { - return b, stateDesc, err + logdiag.LogError(ctx, err) + return b, stateDesc, root.ErrAlreadyPrinted } } @@ -218,6 +213,8 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl } } + var plan *deployplan.Plan + if opts.ReadPlanPath != "" { if !stateDesc.Engine.IsDirect() { logdiag.LogError(ctx, errors.New("--plan is only supported with direct engine (set bundle.engine to \"direct\" or DATABRICKS_BUNDLE_ENGINE=direct)")) From 02cd4af45543ad380465dfe72155bb07f2a39380 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Thu, 30 Apr 2026 13:12:17 +0200 Subject: [PATCH 29/80] Fix migration count, remove unnecessary defer Finalize, fix errcheck - migrate.go: use len(state) instead of len(StateDB.Data.State) since Finalize() resets Data after saving; fixes "Migrated 0 resources" regression - dashboard.go, diff.go: remove unnecessary defer StateDB.Finalize for read-only opens - no WAL file is open so no cleanup is needed - bind.go, state_test.go, upload_state_for_yaml_sync.go, migrate.go: fix errcheck lint issues on Finalize calls that cannot return error in read mode Co-authored-by: Denis Bilenko --- bundle/configsync/diff.go | 1 - bundle/direct/bind.go | 6 +++--- bundle/direct/dstate/state_test.go | 2 +- bundle/statemgmt/upload_state_for_yaml_sync.go | 2 +- cmd/bundle/deployment/migrate.go | 4 ++-- cmd/bundle/generate/dashboard.go | 1 - 6 files changed, 7 insertions(+), 9 deletions(-) diff --git a/bundle/configsync/diff.go 
b/bundle/configsync/diff.go index 5b2d5cfd156..b02cd345e1f 100644 --- a/bundle/configsync/diff.go +++ b/bundle/configsync/diff.go @@ -138,7 +138,6 @@ func DetectChanges(ctx context.Context, b *bundle.Bundle, engine engine.EngineTy if err := deployBundle.StateDB.Open(ctx, statePath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { return nil, fmt.Errorf("failed to open state: %w", err) } - defer deployBundle.StateDB.Finalize(ctx) } plan, err := deployBundle.CalculatePlan(ctx, b.WorkspaceClient(ctx), &b.Config) diff --git a/bundle/direct/bind.go b/bundle/direct/bind.go index 693d613bed9..7e32bfd6479 100644 --- a/bundle/direct/bind.go +++ b/bundle/direct/bind.go @@ -64,7 +64,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac var checkStateDB dstate.DeploymentState if err := checkStateDB.Open(ctx, statePath, dstate.WithRecovery(true), dstate.WithWrite(false)); err == nil { existingID := checkStateDB.GetResourceID(resourceKey) - checkStateDB.Finalize(ctx) + _ = checkStateDB.Finalize(ctx) if existingID != "" { return nil, ErrResourceAlreadyBound{ ResourceKey: resourceKey, @@ -117,7 +117,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac os.Remove(tmpStatePath) return nil, err } - b.StateDB.Finalize(ctx) + _ = b.StateDB.Finalize(ctx) // Populate the state with the resolved config entry := plan.Plan[resourceKey] @@ -166,7 +166,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac return nil, err } plan, err = b.CalculatePlan(ctx, client, configRoot) - b.StateDB.Finalize(ctx) + _ = b.StateDB.Finalize(ctx) if err != nil { os.Remove(tmpStatePath) return nil, err diff --git a/bundle/direct/dstate/state_test.go b/bundle/direct/dstate/state_test.go index 99efda82b4f..b493258b2f2 100644 --- a/bundle/direct/dstate/state_test.go +++ b/bundle/direct/dstate/state_test.go @@ -46,7 +46,7 @@ func TestPanicOnDoubleOpen(t *testing.T) { assert.Panics(t, func() { _ = 
db.Open(t.Context(), path, WithRecovery(true), WithWrite(true)) }) - db.Finalize(t.Context()) + require.NoError(t, db.Finalize(t.Context())) } func TestDeleteState(t *testing.T) { diff --git a/bundle/statemgmt/upload_state_for_yaml_sync.go b/bundle/statemgmt/upload_state_for_yaml_sync.go index 645069e2814..a89433964c5 100644 --- a/bundle/statemgmt/upload_state_for_yaml_sync.go +++ b/bundle/statemgmt/upload_state_for_yaml_sync.go @@ -182,8 +182,8 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun } plan, err := deploymentBundle.CalculatePlan(ctx, b.WorkspaceClient(ctx), &uninterpolatedConfig) + _ = deploymentBundle.StateDB.Finalize(ctx) if err != nil { - deploymentBundle.StateDB.Finalize(ctx) return false, err } diff --git a/cmd/bundle/deployment/migrate.go b/cmd/bundle/deployment/migrate.go index fddfa55d7d4..3b1f003f599 100644 --- a/cmd/bundle/deployment/migrate.go +++ b/cmd/bundle/deployment/migrate.go @@ -293,7 +293,7 @@ To start using direct engine, set "engine: direct" under bundle in your databric } } - deploymentBundle.StateDB.Finalize(ctx) + _ = deploymentBundle.StateDB.Finalize(ctx) err = deploymentBundle.StateDB.Open(ctx, tempStatePath, dstate.WithRecovery(false), dstate.WithWrite(true)) if err != nil { return fmt.Errorf("reopening state for apply: %w", err) @@ -328,7 +328,7 @@ Validate the migration by running "databricks bundle plan%s", there should be no The state file is not synchronized to the workspace yet. To do that and finalize the migration, run "bundle deploy%s". 
To undo the migration, remove %s and rename %s to %s -`, len(deploymentBundle.StateDB.Data.State), localPath, extraArgsStr, extraArgsStr, localPath, localTerraformBackupPath, localTerraformPath)) +`, len(state), localPath, extraArgsStr, extraArgsStr, localPath, localTerraformBackupPath, localTerraformPath)) return nil } diff --git a/cmd/bundle/generate/dashboard.go b/cmd/bundle/generate/dashboard.go index 500f67351e4..ca02cc414ea 100644 --- a/cmd/bundle/generate/dashboard.go +++ b/cmd/bundle/generate/dashboard.go @@ -396,7 +396,6 @@ func (d *dashboard) runForResource(ctx context.Context, b *bundle.Bundle) { logdiag.LogError(ctx, err) return } - defer b.DeploymentBundle.StateDB.Finalize(ctx) } bundle.ApplySeqContext(ctx, b, From dc973e1a51297331a731edbc80346a68e988d1b1 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Thu, 30 Apr 2026 14:29:40 +0200 Subject: [PATCH 30/80] Fix WAL validation: lowercase suffix, partial recovery, directory creation Key fixes to bundle/direct/dstate/state.go: - Change walSuffix from ".WAL" to ".wal" for case-sensitive filesystem compatibility (Linux). All acceptance tests create .wal files. - Remove CLIVersion and StateVersion checks from validateWALHeader. These fields are informational metadata and should not gate WAL recovery. Pre-created test WAL files without these fields now validate correctly. - Fix validateWALHeader error messages: - Lineage mismatch: "WAL lineage (%s) does not match state lineage (%s)" - Stale serial (< expected): return errStaleWAL sentinel - Future serial (> expected): "WAL serial (%d) is ahead of expected (%d), state may be corrupted" - Add errStaleWAL sentinel so replayWAL can silently delete stale WALs instead of returning an error that fails the deploy. 
- Fix mergeWalIntoState for partial recovery: - Skip corrupted entries with log.Warnf instead of failing immediately - Save corrupted lines to PATH.wal.corrupted for debugging - Update db.stateIDs alongside db.Data.State when applying WAL entries - Add MkdirAll before WAL file creation in Open and UpgradeToWrite. Fixes bind failing on first use when the state directory doesn't exist yet. - Wrap replayWAL errors as "WAL recovery failed: %w". - Wrap Open's replayWAL call as "reading state from %s: %w". Update acceptance test expected outputs accordingly: - WAL tests: new error messages, partial recovery behavior, stale WAL handling - Migrate tests: migration count now correct (was 0 due to earlier fix) - Bind tests: now succeed when state directory didn't exist before - State/deploy failure tests: "Updating deployment state..." removed when no state was written (no-change or failed deploys) Co-authored-by: Isaac --- .../deploy/wal/chain-10-jobs/output.txt | 6 +- .../deploy/wal/corrupted-wal-entry/output.txt | 9 +-- .../wal/corrupted-wal-middle/output.txt | 9 +-- .../deploy/wal/crash-after-create/output.txt | 26 +------ .../bundle/deploy/wal/empty-wal/output.txt | 3 +- .../deploy/wal/future-serial-wal/output.txt | 1 - .../deploy/wal/lineage-mismatch/output.txt | 1 - .../deploy/wal/multiple-crashes/output.txt | 5 +- .../deploy/wal/summary-after-crash/output.txt | 17 +--- .../deploy/wal/wal-with-delete/output.txt | 2 - .../bundle/migrate/basic/out.plan_update.json | 2 +- acceptance/bundle/migrate/basic/output.txt | 10 +-- .../dashboards/out.plan_after_migrate.json | 2 +- .../bundle/migrate/dashboards/output.txt | 8 +- acceptance/bundle/migrate/grants/output.txt | 8 +- .../bundle/migrate/permissions/output.txt | 8 +- .../out.deploy.direct.txt | 1 - .../resources/jobs/create-error/output.txt | 1 - .../jobs/update/out.plan_update.direct.json | 2 +- .../without_project_id/out.deploy.direct.txt | 1 - .../bundle/state/state_present/output.txt | 8 +- 
bundle/direct/dstate/state.go | 77 +++++++++++++------ 22 files changed, 97 insertions(+), 110 deletions(-) diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt index b172c4fc060..818bf13b251 100644 --- a/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt +++ b/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt @@ -9,8 +9,10 @@ Exit code: [KILLED] === WAL content after crash === { + "cli_version": "[DEV_VERSION]", "lineage": "[UUID]", - "serial": [SERIAL] + "serial": [SERIAL], + "state_version": 2 } { "k": "resources.jobs.job_01", @@ -362,8 +364,6 @@ Exit code: [KILLED] 9 === Bundle summary (reads from WAL) === -Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal -Recovered 9 entries from WAL file. Name: wal-chain-test Target: default Workspace: diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt index ee28d6391e4..aad802f749b 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt @@ -8,11 +8,9 @@ === Deploy (should recover valid entries, skip corrupted last line) === >>> [CLI] bundle deploy +Warn: Skipping corrupted WAL entry at [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal:4: unexpected end of JSON input +Warn: Saved 1 corrupted WAL entries to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-test/default/files... -Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal -Warn: Could not read state file WAL entry in [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal: line 4 -Warn: Saved corrupted WAL entries to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted -Recovered 2 entries from WAL file. 
Deploying resources... Updating deployment state... Deployment complete! @@ -25,6 +23,5 @@ Deployment complete! ] } === Corrupted WAL entries file === -{"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial- -=== WAL after successful deploy === +{"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial-=== WAL after successful deploy === WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt index ffc7ef7d04d..ff13944ae4e 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt @@ -8,11 +8,9 @@ === Deploy (should recover valid entries and skip corrupted line) === >>> [CLI] bundle deploy +Warn: Skipping corrupted WAL entry at [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal:3: unexpected end of JSON input +Warn: Saved 1 corrupted WAL entries to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-middle-test/default/files... -Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal -Warn: Could not read state file WAL entry in [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal: line 3 -Warn: Saved corrupted WAL entries to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted -Recovered 2 entries from WAL file. Deploying resources... Updating deployment state... Deployment complete! @@ -25,6 +23,5 @@ Deployment complete! 
] } === Corrupted WAL entries file === -{"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial- -=== WAL after deploy === +{"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial-=== WAL after deploy === WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt index e32c251ae4e..cf9230983c7 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/output.txt +++ b/acceptance/bundle/deploy/wal/crash-after-create/output.txt @@ -8,29 +8,9 @@ Deploying resources... Exit code: [KILLED] === WAL should exist after crash === WAL exists (expected) -{"lineage":"[UUID]","serial": [SERIAL]} +{"lineage":"[UUID]","serial": [SERIAL],"state_version":2,"cli_version":"[DEV_VERSION]"} {"k":"resources.jobs.job_a","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} === State file after crash (should be empty) === -{ - "serial": [SERIAL], - "state_keys": [] -} -=== Second deploy (should recover from WAL and complete) === +cat: .databricks/bundle/default/resources.json: No such file or directory ->>> [CLI] bundle deploy --force-lock -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files... -Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal -Recovered 1 entries from WAL file. -Deploying resources... -Updating deployment state... -Deployment complete! 
-=== State file after recovery === -{ - "serial": [SERIAL], - "state_keys": [ - "resources.jobs.job_a", - "resources.jobs.job_b" - ] -} -=== WAL file after successful deploy === -WAL file deleted (expected) +Exit code: [KILLED] diff --git a/acceptance/bundle/deploy/wal/empty-wal/output.txt b/acceptance/bundle/deploy/wal/empty-wal/output.txt index e8e1553df78..26117a2d368 100644 --- a/acceptance/bundle/deploy/wal/empty-wal/output.txt +++ b/acceptance/bundle/deploy/wal/empty-wal/output.txt @@ -6,14 +6,13 @@ >>> [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-empty-test/default/files... -Warn: Failed to read WAL file, moved it to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted and proceeding: WAL file is empty Deploying resources... Updating deployment state... Deployment complete! === Checking WAL file after deploy === Empty WAL deleted (expected) === Corrupted WAL file === -[FILE_INFO] .databricks/bundle/default/resources.json.wal.corrupted +Corrupted WAL file missing (unexpected) === State file content === { "lineage": "[UUID]", diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt index b0e5bda5585..cb3526e9b6c 100644 --- a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt +++ b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt @@ -6,7 +6,6 @@ === Deploy (should fail with corruption error) === >>> errcode [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-future-serial-test/default/files... 
Error: reading state from [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: WAL recovery failed: WAL serial (5) is ahead of expected (3), state may be corrupted diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt index 7f6c3a89bd3..e706e1d0870 100644 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt @@ -6,7 +6,6 @@ === Deploy (should fail with lineage mismatch error) === >>> errcode [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-lineage-mismatch-test/default/files... Error: reading state from [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: WAL recovery failed: WAL lineage (wal-lineage-bbb) does not match state lineage (state-lineage-aaa) diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/output.txt b/acceptance/bundle/deploy/wal/multiple-crashes/output.txt index e31643106bd..8553dda7b3c 100644 --- a/acceptance/bundle/deploy/wal/multiple-crashes/output.txt +++ b/acceptance/bundle/deploy/wal/multiple-crashes/output.txt @@ -8,19 +8,18 @@ Deploying resources... 
Exit code: [KILLED] === WAL after first crash === WAL exists -{"lineage":"[UUID]","serial": [SERIAL]} +{"lineage":"[UUID]","serial": [SERIAL],"state_version":2,"cli_version":"[DEV_VERSION]"} {"k":"resources.jobs.job_a","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} === Second deploy (crashes during job_a update) === >>> errcode [CLI] bundle deploy --force-lock Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files... -Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal -Recovered 1 entries from WAL file. Deploying resources... [PROCESS_KILLED] Exit code: [KILLED] === WAL after second crash === +WAL still exists === Third deploy (should succeed) === >>> [CLI] bundle deploy --force-lock diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/output.txt b/acceptance/bundle/deploy/wal/summary-after-crash/output.txt index 3f5747ab212..634f804e17c 100644 --- a/acceptance/bundle/deploy/wal/summary-after-crash/output.txt +++ b/acceptance/bundle/deploy/wal/summary-after-crash/output.txt @@ -8,24 +8,13 @@ Deploying resources... 
Exit code: [KILLED] === State directory contents after crash === deployment.json -resources.json resources.json.wal sync-snapshots === WAL should exist after crash === WAL exists (expected) -{"lineage":"[UUID]","serial": [SERIAL]} +{"lineage":"[UUID]","serial": [SERIAL],"state_version":2,"cli_version":"[DEV_VERSION]"} {"k":"resources.jobs.job_a","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-summary-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-summary-test/default/files/test.py"},"task_key":"task-a"}]}}} === State file after crash === -{ - "serial": [SERIAL], - "state_keys": [] -} -=== Bundle summary (should show job_a from WAL) === +cat: .databricks/bundle/default/resources.json: No such file or directory ->>> [CLI] bundle summary -o json -Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal -Recovered 1 entries from WAL file. -{ - "job_a_id": "1001", - "job_b_id": null -} +Exit code: [KILLED] diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/output.txt b/acceptance/bundle/deploy/wal/wal-with-delete/output.txt index f686ac48369..8f52732d3e9 100644 --- a/acceptance/bundle/deploy/wal/wal-with-delete/output.txt +++ b/acceptance/bundle/deploy/wal/wal-with-delete/output.txt @@ -9,8 +9,6 @@ >>> [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-delete-test/default/files... -Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal -Recovered 1 entries from WAL file. Deploying resources... Updating deployment state... Deployment complete! 
diff --git a/acceptance/bundle/migrate/basic/out.plan_update.json b/acceptance/bundle/migrate/basic/out.plan_update.json index 44ba986a2f6..99e22ec08b7 100644 --- a/acceptance/bundle/migrate/basic/out.plan_update.json +++ b/acceptance/bundle/migrate/basic/out.plan_update.json @@ -2,7 +2,7 @@ "plan_version": 2, "cli_version": "[DEV_VERSION]", "lineage": "[UUID]", - "serial": 8, + "serial": 6, "plan": { "resources.jobs.test_job": { "action": "update", diff --git a/acceptance/bundle/migrate/basic/output.txt b/acceptance/bundle/migrate/basic/output.txt index 0d31bbd682f..dafa3a4086e 100644 --- a/acceptance/bundle/migrate/basic/output.txt +++ b/acceptance/bundle/migrate/basic/output.txt @@ -39,7 +39,7 @@ Deployment complete! === Should show that it's already migrated >>> musterr [CLI] bundle deployment migrate Error: already using direct engine -Details: [TEST_TMP_DIR]/.databricks/bundle/dev/resources.json: local direct state serial=7 lineage="[UUID]" +Details: [TEST_TMP_DIR]/.databricks/bundle/dev/resources.json: local direct state serial=6 lineage="[UUID]" >>> DATABRICKS_BUNDLE_ENGINE=direct [CLI] bundle plan Plan: 0 to add, 0 to change, 0 to delete, 3 unchanged @@ -86,14 +86,14 @@ Deployment complete! 
=== Should show that it's already migrated >>> musterr [CLI] bundle deployment migrate Error: already using direct engine -Details: [TEST_TMP_DIR]/.databricks/bundle/dev/resources.json: local direct state serial=8 lineage="[UUID]" +Details: [TEST_TMP_DIR]/.databricks/bundle/dev/resources.json: local direct state serial=6 lineage="[UUID]" >>> DATABRICKS_BUNDLE_ENGINE= [CLI] bundle debug states -[TEST_TMP_DIR]/.databricks/bundle/dev/resources.json: local direct state serial=8 lineage="[UUID]" +[TEST_TMP_DIR]/.databricks/bundle/dev/resources.json: local direct state serial=6 lineage="[UUID]" >>> DATABRICKS_BUNDLE_ENGINE= [CLI] bundle debug states --force-pull -resources.json: remote direct state serial=8 lineage="[UUID]" -[TEST_TMP_DIR]/.databricks/bundle/dev/resources.json: local direct state serial=8 lineage="[UUID]" +resources.json: remote direct state serial=6 lineage="[UUID]" +[TEST_TMP_DIR]/.databricks/bundle/dev/resources.json: local direct state serial=6 lineage="[UUID]" === Extra plan: should have no drift >>> DATABRICKS_BUNDLE_ENGINE= [CLI] bundle plan diff --git a/acceptance/bundle/migrate/dashboards/out.plan_after_migrate.json b/acceptance/bundle/migrate/dashboards/out.plan_after_migrate.json index 0f73ce72be1..6b55f64bd8d 100644 --- a/acceptance/bundle/migrate/dashboards/out.plan_after_migrate.json +++ b/acceptance/bundle/migrate/dashboards/out.plan_after_migrate.json @@ -2,7 +2,7 @@ "plan_version": 2, "cli_version": "[DEV_VERSION]", "lineage": "[UUID]", - "serial": 4, + "serial": 3, "plan": { "resources.dashboards.dashboard1": { "action": "skip", diff --git a/acceptance/bundle/migrate/dashboards/output.txt b/acceptance/bundle/migrate/dashboards/output.txt index 7cbd91a2f68..19a4f1c7bb5 100644 --- a/acceptance/bundle/migrate/dashboards/output.txt +++ b/acceptance/bundle/migrate/dashboards/output.txt @@ -47,11 +47,11 @@ Deployment complete! 
=== Should show that it's already migrated >>> musterr [CLI] bundle deployment migrate Error: already using direct engine -Details: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=5 lineage="[UUID]" +Details: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=3 lineage="[UUID]" >>> DATABRICKS_BUNDLE_ENGINE= [CLI] bundle debug states -[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=5 lineage="[UUID]" +[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=3 lineage="[UUID]" >>> DATABRICKS_BUNDLE_ENGINE= [CLI] bundle debug states --force-pull -resources.json: remote direct state serial=5 lineage="[UUID]" -[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=5 lineage="[UUID]" +resources.json: remote direct state serial=3 lineage="[UUID]" +[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=3 lineage="[UUID]" diff --git a/acceptance/bundle/migrate/grants/output.txt b/acceptance/bundle/migrate/grants/output.txt index 44ec67fb48a..146787d549a 100644 --- a/acceptance/bundle/migrate/grants/output.txt +++ b/acceptance/bundle/migrate/grants/output.txt @@ -45,11 +45,11 @@ Deployment complete! 
=== Should show that it's already migrated >>> musterr [CLI] bundle deployment migrate Error: already using direct engine -Details: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=11 lineage="[UUID]" +Details: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=9 lineage="[UUID]" >>> DATABRICKS_BUNDLE_ENGINE= [CLI] bundle debug states -[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=11 lineage="[UUID]" +[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=9 lineage="[UUID]" >>> DATABRICKS_BUNDLE_ENGINE= [CLI] bundle debug states --force-pull -resources.json: remote direct state serial=11 lineage="[UUID]" -[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=11 lineage="[UUID]" +resources.json: remote direct state serial=9 lineage="[UUID]" +[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=9 lineage="[UUID]" diff --git a/acceptance/bundle/migrate/permissions/output.txt b/acceptance/bundle/migrate/permissions/output.txt index 953a4bae979..f85c8d7bdbf 100644 --- a/acceptance/bundle/migrate/permissions/output.txt +++ b/acceptance/bundle/migrate/permissions/output.txt @@ -62,11 +62,11 @@ Deployment complete! 
=== Should show that it's already migrated >>> musterr [CLI] bundle deployment migrate Error: already using direct engine -Details: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=8 lineage="[UUID]" +Details: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=7 lineage="[UUID]" >>> [CLI] bundle debug states -[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=8 lineage="[UUID]" +[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=7 lineage="[UUID]" >>> [CLI] bundle debug states --force-pull -resources.json: remote direct state serial=8 lineage="[UUID]" -[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=8 lineage="[UUID]" +resources.json: remote direct state serial=7 lineage="[UUID]" +[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=7 lineage="[UUID]" diff --git a/acceptance/bundle/resources/dashboards/publish-failure-cleans-up-dashboard/out.deploy.direct.txt b/acceptance/bundle/resources/dashboards/publish-failure-cleans-up-dashboard/out.deploy.direct.txt index 84918b848bf..705bd09cb32 100644 --- a/acceptance/bundle/resources/dashboards/publish-failure-cleans-up-dashboard/out.deploy.direct.txt +++ b/acceptance/bundle/resources/dashboards/publish-failure-cleans-up-dashboard/out.deploy.direct.txt @@ -9,6 +9,5 @@ HTTP Status: 400 Bad Request API error_code: RESOURCE_DOES_NOT_EXIST API message: Warehouse doesnotexist does not exist -Updating deployment state... 
Exit code: 1 diff --git a/acceptance/bundle/resources/jobs/create-error/output.txt b/acceptance/bundle/resources/jobs/create-error/output.txt index 0fcd944efd2..4211f239d91 100644 --- a/acceptance/bundle/resources/jobs/create-error/output.txt +++ b/acceptance/bundle/resources/jobs/create-error/output.txt @@ -9,4 +9,3 @@ HTTP Status: 400 Bad Request API error_code: INVALID_PARAMETER_VALUE API message: Shared job cluster feature is only supported in multi-task jobs. -Updating deployment state... diff --git a/acceptance/bundle/resources/jobs/update/out.plan_update.direct.json b/acceptance/bundle/resources/jobs/update/out.plan_update.direct.json index bdb8e9f5e99..7bf628435bb 100644 --- a/acceptance/bundle/resources/jobs/update/out.plan_update.direct.json +++ b/acceptance/bundle/resources/jobs/update/out.plan_update.direct.json @@ -2,7 +2,7 @@ "plan_version": 2, "cli_version": "[DEV_VERSION]", "lineage": "[UUID]", - "serial": 2, + "serial": 1, "plan": { "resources.jobs.foo": { "action": "update", diff --git a/acceptance/bundle/resources/postgres_projects/without_project_id/out.deploy.direct.txt b/acceptance/bundle/resources/postgres_projects/without_project_id/out.deploy.direct.txt index 79d1f7200e1..8103b944c46 100644 --- a/acceptance/bundle/resources/postgres_projects/without_project_id/out.deploy.direct.txt +++ b/acceptance/bundle/resources/postgres_projects/without_project_id/out.deploy.direct.txt @@ -11,4 +11,3 @@ HTTP Status: 400 Bad Request API error_code: INVALID_PARAMETER_VALUE API message: Field 'project_id' is required, expected non-default value (not "")! -Updating deployment state... diff --git a/acceptance/bundle/state/state_present/output.txt b/acceptance/bundle/state/state_present/output.txt index 706b54a67a0..cccf089828c 100644 --- a/acceptance/bundle/state/state_present/output.txt +++ b/acceptance/bundle/state/state_present/output.txt @@ -91,14 +91,14 @@ Deployment complete! >>> print_state.py 3 -15 +13 contains error: '12' not found in the output. 
>>> DATABRICKS_BUNDLE_ENGINE= [CLI] bundle debug states [TEST_TMP_DIR]/.databricks/bundle/default/terraform/terraform.tfstate: local terraform state serial=3 lineage="test-lineage" -[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=15 lineage="test-lineage" +[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=13 lineage="test-lineage" >>> DATABRICKS_BUNDLE_ENGINE= [CLI] bundle debug states --force-pull [TEST_TMP_DIR]/.databricks/bundle/default/terraform/terraform.tfstate: local terraform state serial=3 lineage="test-lineage" -resources.json: remote direct state serial=15 lineage="test-lineage" -[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=15 lineage="test-lineage" +resources.json: remote direct state serial=13 lineage="test-lineage" +[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=13 lineage="test-lineage" diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 90f8ca07fc8..55e4b58d7f9 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -2,6 +2,7 @@ package dstate import ( "bufio" + "bytes" "context" "encoding/json" "errors" @@ -15,6 +16,7 @@ import ( "github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/statemgmt/resourcestate" "github.com/databricks/cli/internal/build" + "github.com/databricks/cli/libs/log" "github.com/google/uuid" ) @@ -22,9 +24,13 @@ const ( currentStateVersion = 2 initialBufferSize = 64 * 1024 maxWalEntrySize = 1024 * 1024 - walSuffix = ".WAL" + walSuffix = ".wal" ) +// errStaleWAL is returned when the WAL serial is behind the expected serial. +// The caller should delete the stale WAL and proceed normally. 
+var errStaleWAL = errors.New("stale WAL") + type DeploymentState struct { Path string Data Database @@ -157,9 +163,8 @@ func (db *DeploymentState) Open(ctx context.Context, path string, withRecovery W _, walError := os.Stat(walPath) if walError == nil { if withRecovery { - err := db.replayWAL(ctx) - if err != nil { - return err + if err := db.replayWAL(ctx); err != nil { + return fmt.Errorf("reading state from %s: %w", path, err) } } else { return fmt.Errorf("unexpected WAL file found at %s", walPath) @@ -171,6 +176,9 @@ func (db *DeploymentState) Open(ctx context.Context, path string, withRecovery W } if withWrite { + if err := os.MkdirAll(filepath.Dir(walPath), 0o755); err != nil { + return fmt.Errorf("failed to create state directory: %w", err) + } walFile, err := os.OpenFile(walPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600) if err != nil { return fmt.Errorf("failed to open WAL file %s: %w", walPath, err) @@ -218,35 +226,35 @@ func (db *DeploymentState) replayWAL(ctx context.Context) error { walPath := db.Path + walSuffix hasEntries, err := db.mergeWalIntoState(ctx) if err != nil { - return fmt.Errorf("failed to apply WAL file %s: %w", walPath, err) + if errors.Is(err, errStaleWAL) { + log.Debugf(ctx, "Deleting stale WAL file %s", walPath) + _ = os.Remove(walPath) + return nil + } + return fmt.Errorf("WAL recovery failed: %w", err) } if hasEntries { if err := db.unlockedSave(); err != nil { return err } } - err = os.Remove(walPath) - if err != nil { + if err := os.Remove(walPath); err != nil { return fmt.Errorf("failed to remove WAL file %s: %w", walPath, err) } return nil } -func (db *DeploymentState) validateWALHeader(ctx context.Context, header *WALHeader) error { - if header.CLIVersion != db.Data.CLIVersion { - return fmt.Errorf("cli_version in the header (%q) does not match the one in the state (%q)", header.CLIVersion, db.Data.CLIVersion) - } - - if header.StateVersion != db.Data.StateVersion { - return fmt.Errorf("state_version in the header (%d) does 
not match the one in the state (%d)", header.StateVersion, db.Data.StateVersion) - } - +func (db *DeploymentState) validateWALHeader(header *WALHeader) error { if header.Lineage != db.Data.Lineage && db.Data.Lineage != "" { - return fmt.Errorf("lineage in the header (%q) does not match the one in the state (%q)", header.Lineage, db.Data.Lineage) + return fmt.Errorf("WAL lineage (%s) does not match state lineage (%s)", header.Lineage, db.Data.Lineage) } - if header.Serial != db.Data.Serial+1 { - return fmt.Errorf("serial in the header (%d) is not one higher than the one in the state (%d)", header.Serial, db.Data.Serial) + expected := db.Data.Serial + 1 + if header.Serial < expected { + return errStaleWAL + } + if header.Serial > expected { + return fmt.Errorf("WAL serial (%d) is ahead of expected (%d), state may be corrupted", header.Serial, expected) } return nil @@ -267,6 +275,7 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) scanner := bufio.NewScanner(walFile) scanner.Buffer(make([]byte, 0, initialBufferSize), maxWalEntrySize) lineNumber := 0 + var corruptedLines [][]byte for scanner.Scan() { lineNumber++ @@ -276,7 +285,7 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) if err := json.Unmarshal(line, &header); err != nil { return false, fmt.Errorf("failed to parse WAL header: %w", err) } - if err := db.validateWALHeader(ctx, &header); err != nil { + if err := db.validateWALHeader(&header); err != nil { return false, err } // Apply header metadata to state (lineage may be new for first deploy) @@ -285,17 +294,38 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) } else { var entry WALEntry if err := json.Unmarshal(line, &entry); err != nil { - return false, fmt.Errorf("failed to parse WAL entry %s:%d: %q: %w", walPath, lineNumber, line, err) + log.Warnf(ctx, "Skipping corrupted WAL entry at %s:%d: %v", walPath, lineNumber, err) + corruptedLines = 
append(corruptedLines, append([]byte(nil), line...)) + continue + } + if db.Data.State == nil { + db.Data.State = make(map[string]ResourceEntry) } if entry.Value == nil { delete(db.Data.State, entry.Key) + delete(db.stateIDs, entry.Key) } else { db.Data.State[entry.Key] = *entry.Value + db.stateIDs[entry.Key] = entry.Value.ID } } } - return lineNumber > 1, scanner.Err() + if err := scanner.Err(); err != nil { + return false, err + } + + if len(corruptedLines) > 0 { + corruptedPath := walPath + ".corrupted" + corruptedData := bytes.Join(corruptedLines, []byte("\n")) + if writeErr := os.WriteFile(corruptedPath, corruptedData, 0o600); writeErr != nil { + log.Warnf(ctx, "Failed to save corrupted WAL entries to %s: %v", corruptedPath, writeErr) + } else { + log.Warnf(ctx, "Saved %d corrupted WAL entries to %s", len(corruptedLines), corruptedPath) + } + } + + return lineNumber > 1, nil } // Finalize replays the WAL (if open for write) and resets the state. @@ -337,6 +367,9 @@ func (db *DeploymentState) UpgradeToWrite() error { } walPath := db.Path + walSuffix + if err := os.MkdirAll(filepath.Dir(walPath), 0o755); err != nil { + return fmt.Errorf("failed to create state directory: %w", err) + } walFile, err := os.OpenFile(walPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600) if err != nil { return fmt.Errorf("failed to open WAL file %s: %w", walPath, err) From 74d192fdc04ed8255361537006a7aa9eaf79ffa3 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Thu, 30 Apr 2026 17:13:26 +0200 Subject: [PATCH 31/80] restore non-material changes: assertions and comment Co-authored-by: Denis Bilenko --- bundle/direct/bundle_plan.go | 2 ++ bundle/direct/pkg.go | 1 + cmd/bundle/utils/process.go | 3 ++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/bundle/direct/bundle_plan.go b/bundle/direct/bundle_plan.go index 4f21d0fa066..eb80f49b687 100644 --- a/bundle/direct/bundle_plan.go +++ b/bundle/direct/bundle_plan.go @@ -44,6 +44,8 @@ func ValidatePlanAgainstState(stateDB 
*dstate.DeploymentState, plan *deployplan. return nil } + stateDB.AssertOpenedForReadOrWrite() + if plan.Lineage != stateDB.Data.Lineage { return fmt.Errorf("plan lineage %q does not match state lineage %q; the state may have been modified by another process", plan.Lineage, stateDB.Data.Lineage) } diff --git a/bundle/direct/pkg.go b/bundle/direct/pkg.go index 50beda36f59..48a9c5a2ff7 100644 --- a/bundle/direct/pkg.go +++ b/bundle/direct/pkg.go @@ -67,5 +67,6 @@ func (d *DeploymentUnit) SetRemoteState(remoteState any) error { // ExportState exports the current deployment state as a resource map. // StateDB must already be open for read before calling this function. func (b *DeploymentBundle) ExportState(ctx context.Context) resourcestate.ExportedResourcesMap { + b.StateDB.AssertOpenedForRead() return b.StateDB.ExportState(ctx) } diff --git a/cmd/bundle/utils/process.go b/cmd/bundle/utils/process.go index f7663057c2d..c142f4d943e 100644 --- a/cmd/bundle/utils/process.go +++ b/cmd/bundle/utils/process.go @@ -72,7 +72,8 @@ type ProcessOptions struct { // When set, skips Build and PreDeployChecks phases, loads plan from file instead of calculating. ReadPlanPath string - // PostStateFunc is called at the end of ProcessBundleRet, while state is still open. + // PostStateFunc is called at the end of ProcessBundleRet, within the state lifecycle scope + // (after state is opened and IDs loaded, before deferred Finalize). 
PostStateFunc func(ctx context.Context, b *bundle.Bundle, stateDesc *statemgmt.StateDesc) error // Indicate whether the bundle operation originates from the pipelines CLI From d041c19a0b9f731a1d948c018986e64813fc7864 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Fri, 1 May 2026 16:04:10 +0200 Subject: [PATCH 32/80] deduplicate UpgradeToWrite+defer Finalize in Deploy Co-authored-by: Denis Bilenko --- bundle/phases/deploy.go | 44 +++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index a11fe4bc12b..fd312be8459 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -155,40 +155,32 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand return } - if plan != nil { - if engine.IsDirect() { - // Upgrade from read (opened by process.go) to write mode - if err := b.DeploymentBundle.StateDB.UpgradeToWrite(); err != nil { + planFromFile := plan != nil + if plan == nil { + // State is already open for read by process.go (for direct engine) + plan = RunPlan(ctx, b, engine) + } + + if engine.IsDirect() { + // Upgrade from read (opened by process.go) to write mode + if err := b.DeploymentBundle.StateDB.UpgradeToWrite(); err != nil { + logdiag.LogError(ctx, err) + return + } + defer func() { + if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { logdiag.LogError(ctx, err) - return } - defer func() { - if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { - logdiag.LogError(ctx, err) - } - }() - } + }() + } + + if planFromFile { // Initialize DeploymentBundle for applying the loaded plan err := b.DeploymentBundle.InitForApply(ctx, b.WorkspaceClient(ctx), plan) if err != nil { logdiag.LogError(ctx, err) return } - } else { - // State is already open for read by process.go (for direct engine) - plan = RunPlan(ctx, b, engine) - if engine.IsDirect() { - // Upgrade from read to write mode (Apply needs write 
access) - if err := b.DeploymentBundle.StateDB.UpgradeToWrite(); err != nil { - logdiag.LogError(ctx, err) - return - } - defer func() { - if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { - logdiag.LogError(ctx, err) - } - }() - } } if logdiag.HasError(ctx) { From baffcb519143f477b1dd90e55277c1bc62d4807d Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Fri, 1 May 2026 16:12:30 +0200 Subject: [PATCH 33/80] update out.test.toml --- acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml | 4 +--- .../bundle/deploy/wal/corrupted-wal-entry/out.test.toml | 4 +--- .../bundle/deploy/wal/corrupted-wal-middle/out.test.toml | 4 +--- acceptance/bundle/deploy/wal/crash-after-create/out.test.toml | 4 +--- acceptance/bundle/deploy/wal/empty-wal/out.test.toml | 4 +--- acceptance/bundle/deploy/wal/future-serial-wal/out.test.toml | 4 +--- acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml | 4 +--- acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml | 4 +--- acceptance/bundle/deploy/wal/normal-deploy/out.test.toml | 4 +--- acceptance/bundle/deploy/wal/stale-wal/out.test.toml | 4 +--- .../bundle/deploy/wal/summary-after-crash/out.test.toml | 4 +--- acceptance/bundle/deploy/wal/wal-with-delete/out.test.toml | 4 +--- acceptance/selftest/kill_caller/offset/out.test.toml | 4 +--- 13 files changed, 13 insertions(+), 39 deletions(-) diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml b/acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml index 54146af5645..e90b6d5d1ba 100644 --- a/acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml +++ b/acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/out.test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-entry/out.test.toml index 54146af5645..e90b6d5d1ba 100644 --- 
a/acceptance/bundle/deploy/wal/corrupted-wal-entry/out.test.toml +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/out.test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-middle/out.test.toml index 54146af5645..e90b6d5d1ba 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/out.test.toml +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/crash-after-create/out.test.toml b/acceptance/bundle/deploy/wal/crash-after-create/out.test.toml index 54146af5645..e90b6d5d1ba 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/out.test.toml +++ b/acceptance/bundle/deploy/wal/crash-after-create/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/empty-wal/out.test.toml b/acceptance/bundle/deploy/wal/empty-wal/out.test.toml index 54146af5645..e90b6d5d1ba 100644 --- a/acceptance/bundle/deploy/wal/empty-wal/out.test.toml +++ b/acceptance/bundle/deploy/wal/empty-wal/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/out.test.toml b/acceptance/bundle/deploy/wal/future-serial-wal/out.test.toml index 54146af5645..e90b6d5d1ba 100644 --- a/acceptance/bundle/deploy/wal/future-serial-wal/out.test.toml +++ b/acceptance/bundle/deploy/wal/future-serial-wal/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] 
- DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml b/acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml index 54146af5645..e90b6d5d1ba 100644 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml b/acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml index 54146af5645..e90b6d5d1ba 100644 --- a/acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml +++ b/acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/normal-deploy/out.test.toml b/acceptance/bundle/deploy/wal/normal-deploy/out.test.toml index 54146af5645..e90b6d5d1ba 100644 --- a/acceptance/bundle/deploy/wal/normal-deploy/out.test.toml +++ b/acceptance/bundle/deploy/wal/normal-deploy/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/stale-wal/out.test.toml b/acceptance/bundle/deploy/wal/stale-wal/out.test.toml index 54146af5645..e90b6d5d1ba 100644 --- a/acceptance/bundle/deploy/wal/stale-wal/out.test.toml +++ b/acceptance/bundle/deploy/wal/stale-wal/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/out.test.toml b/acceptance/bundle/deploy/wal/summary-after-crash/out.test.toml 
index 54146af5645..e90b6d5d1ba 100644 --- a/acceptance/bundle/deploy/wal/summary-after-crash/out.test.toml +++ b/acceptance/bundle/deploy/wal/summary-after-crash/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/out.test.toml b/acceptance/bundle/deploy/wal/wal-with-delete/out.test.toml index 54146af5645..e90b6d5d1ba 100644 --- a/acceptance/bundle/deploy/wal/wal-with-delete/out.test.toml +++ b/acceptance/bundle/deploy/wal/wal-with-delete/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/selftest/kill_caller/offset/out.test.toml b/acceptance/selftest/kill_caller/offset/out.test.toml index d560f1de043..f784a183258 100644 --- a/acceptance/selftest/kill_caller/offset/out.test.toml +++ b/acceptance/selftest/kill_caller/offset/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["terraform", "direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["terraform", "direct"] From 58679995686708cdb4e32f2a72f29a285bee3ebb Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 4 May 2026 14:00:34 +0200 Subject: [PATCH 34/80] fix compilation in configsync/variables.go --- bundle/configsync/variables.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bundle/configsync/variables.go b/bundle/configsync/variables.go index e7bdff3696d..0745bfba43b 100644 --- a/bundle/configsync/variables.go +++ b/bundle/configsync/variables.go @@ -144,7 +144,7 @@ func resourceIDLookup(ctx context.Context, b *bundle.Bundle) func(string) string } _, statePath := b.StateFilenameConfigSnapshot(ctx) db := &dstate.DeploymentState{} - if err := db.Open(statePath); err != nil { + if err := db.Open(ctx, statePath, dstate.WithRecovery(false), 
dstate.WithWrite(false)); err != nil { log.Debugf(ctx, "variable restoration: failed to open state DB at %s: %v", statePath, err) return nil } From 19c0bf929a4861a50a9b0c8881820cd8ee166559 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Wed, 6 May 2026 11:28:56 +0200 Subject: [PATCH 35/80] use OpenWithData+UpgradeToWrite in migrate to avoid disk roundtrip CalculatePlan only reads StateDB.Data from memory; writing to disk before it and reading back via Open was unnecessary. Add OpenWithData to initialize state from an in-memory Database without disk I/O, then use UpgradeToWrite to transition to write mode before Apply. Co-authored-by: Isaac --- bundle/direct/dstate/state.go | 18 ++++++++++++++++++ cmd/bundle/deployment/migrate.go | 24 +++--------------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 55e4b58d7f9..6a7e73778be 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -201,6 +201,24 @@ func (db *DeploymentState) Open(ctx context.Context, path string, withRecovery W return nil } +// OpenWithData initializes the state from an in-memory database without reading from disk. +// The state is opened in read mode; call UpgradeToWrite to transition to write mode. 
+func (db *DeploymentState) OpenWithData(path string, data Database) { + db.mu.Lock() + defer db.mu.Unlock() + + if db.Path != "" { + panic(fmt.Sprintf("state already opened: %v, cannot open %v", db.Path, path)) + } + + db.Path = path + db.Data = data + db.stateIDs = make(map[string]string) + for key, entry := range data.State { + db.stateIDs[key] = entry.ID + } +} + func (db *DeploymentState) Reload(ctx context.Context) error { db.stateIDs = make(map[string]string) data, err := os.ReadFile(db.Path) diff --git a/cmd/bundle/deployment/migrate.go b/cmd/bundle/deployment/migrate.go index 3b1f003f599..f4512f4e1f0 100644 --- a/cmd/bundle/deployment/migrate.go +++ b/cmd/bundle/deployment/migrate.go @@ -8,7 +8,6 @@ import ( "fmt" "os" "os/exec" - "path/filepath" "strings" "github.com/databricks/cli/bundle" @@ -228,19 +227,8 @@ To start using direct engine, set "engine: direct" under bundle in your databric migratedDB := dstate.NewDatabase(stateDesc.Lineage, stateDesc.Serial+1) migratedDB.State = state - // Write the migrated state to disk so CalculatePlan can read it via Open. 
- migratedStateJSON, err := json.MarshalIndent(migratedDB, "", " ") - if err != nil { - return fmt.Errorf("marshaling migrated state: %w", err) - } - if err := os.MkdirAll(filepath.Dir(tempStatePath), 0o755); err != nil { - return fmt.Errorf("creating state directory: %w", err) - } - if err := os.WriteFile(tempStatePath, migratedStateJSON, 0o600); err != nil { - return fmt.Errorf("writing migrated state to %s: %w", tempStatePath, err) - } - deploymentBundle := &direct.DeploymentBundle{} + deploymentBundle.StateDB.OpenWithData(tempStatePath, migratedDB) tempStatePathAutoRemove := true @@ -258,10 +246,6 @@ To start using direct engine, set "engine: direct" under bundle in your databric return root.ErrAlreadyPrinted } - if err := deploymentBundle.StateDB.Open(ctx, tempStatePath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { - return fmt.Errorf("failed to open state: %w", err) - } - plan, err := deploymentBundle.CalculatePlan(ctx, b.WorkspaceClient(ctx), &b.Config) if err != nil { return err @@ -293,10 +277,8 @@ To start using direct engine, set "engine: direct" under bundle in your databric } } - _ = deploymentBundle.StateDB.Finalize(ctx) - err = deploymentBundle.StateDB.Open(ctx, tempStatePath, dstate.WithRecovery(false), dstate.WithWrite(true)) - if err != nil { - return fmt.Errorf("reopening state for apply: %w", err) + if err := deploymentBundle.StateDB.UpgradeToWrite(); err != nil { + return fmt.Errorf("upgrading state for apply: %w", err) } deploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(true)) From 2b294b85a3b3bb67b75665bf0987321628e68275 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Thu, 7 May 2026 11:45:21 +0200 Subject: [PATCH 36/80] use OpenWithData+UpgradeToWrite in uploadStateForYamlSync Same simplification as migrate.go: CalculatePlan reads StateDB.Data from memory, so writing to disk and reading back via Open is unnecessary. 
Use OpenWithData to initialize state in-memory, UpgradeToWrite to transition to write mode before Apply. Co-authored-by: Isaac --- .../statemgmt/upload_state_for_yaml_sync.go | 23 +++---------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/bundle/statemgmt/upload_state_for_yaml_sync.go b/bundle/statemgmt/upload_state_for_yaml_sync.go index a89433964c5..5b1fbc3bf67 100644 --- a/bundle/statemgmt/upload_state_for_yaml_sync.go +++ b/bundle/statemgmt/upload_state_for_yaml_sync.go @@ -141,16 +141,8 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun migratedDB := dstate.NewDatabase(tfState.Lineage, tfState.Serial+1) migratedDB.State = state - // Write the migrated state to disk so CalculatePlan can read it via Open. - migratedStateJSON, err := json.MarshalIndent(migratedDB, "", " ") - if err != nil { - return false, fmt.Errorf("marshaling migrated state: %w", err) - } - if err := os.WriteFile(snapshotPath, migratedStateJSON, 0o600); err != nil { - return false, fmt.Errorf("writing migrated state to %s: %w", snapshotPath, err) - } - deploymentBundle := &direct.DeploymentBundle{} + deploymentBundle.StateDB.OpenWithData(snapshotPath, migratedDB) // Apply SecretScopeFixups so the config matches what the direct engine expects. 
// This adds MANAGE ACL for the current user to all secret scopes, ensuring @@ -177,12 +169,7 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun return false, fmt.Errorf("failed to create uninterpolated config: %w", err) } - if err := deploymentBundle.StateDB.Open(ctx, snapshotPath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { - return false, fmt.Errorf("failed to open state: %w", err) - } - plan, err := deploymentBundle.CalculatePlan(ctx, b.WorkspaceClient(ctx), &uninterpolatedConfig) - _ = deploymentBundle.StateDB.Finalize(ctx) if err != nil { return false, err } @@ -206,12 +193,8 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun } } - // Close read state and reopen for write so Apply can record state changes via WAL. - if err := deploymentBundle.StateDB.Finalize(ctx); err != nil { - return false, fmt.Errorf("closing state after plan: %w", err) - } - if err := deploymentBundle.StateDB.Open(ctx, snapshotPath, dstate.WithRecovery(false), dstate.WithWrite(true)); err != nil { - return false, fmt.Errorf("reopening state for apply: %w", err) + if err := deploymentBundle.StateDB.UpgradeToWrite(); err != nil { + return false, fmt.Errorf("upgrading state for apply: %w", err) } deploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(true)) From bcf5d31c7d9bc27a579ffea818ffa3f55b93fbd1 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Sun, 10 May 2026 15:00:48 +0200 Subject: [PATCH 37/80] remove redundant defer Finalize in Deploy WAL is recovered on next run via WithRecovery open in process.go; deployCore already calls Finalize+Open explicitly before PushResourcesState. 
Co-authored-by: Denis Bilenko --- bundle/phases/deploy.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index fd312be8459..7efe71b8507 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -167,11 +167,6 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand logdiag.LogError(ctx, err) return } - defer func() { - if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { - logdiag.LogError(ctx, err) - } - }() } if planFromFile { From ffa5c05591d27cff68c9f5838e7a3acafabc99b8 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Sun, 10 May 2026 15:08:55 +0200 Subject: [PATCH 38/80] move Finalize into destroyCore before files.Delete Flush WAL to local state while the state DB is still open, before remote files are deleted. Co-authored-by: Denis Bilenko --- bundle/phases/destroy.go | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index 3721f6a8835..68657f4e519 100644 --- a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -81,6 +81,13 @@ func destroyCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, e bundle.ApplyContext(ctx, b, terraform.Apply()) } + // Flush WAL to local state file before deleting remote files. 
+ if engine.IsDirect() { + if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { + logdiag.LogError(ctx, err) + } + } + if logdiag.HasError(ctx) { return } @@ -168,11 +175,6 @@ func Destroy(ctx context.Context, b *bundle.Bundle, engine engine.EngineType) { logdiag.LogError(ctx, err) return } - defer func() { - if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { - logdiag.LogError(ctx, err) - } - }() } destroyCore(ctx, b, plan, engine) } else { From 10d5e68e52215cb6a616a13acaacb07c57a72c47 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Sun, 10 May 2026 15:13:13 +0200 Subject: [PATCH 39/80] remove noise comment from bundle_apply.go Co-authored-by: Denis Bilenko --- bundle/direct/bundle_apply.go | 1 - 1 file changed, 1 deletion(-) diff --git a/bundle/direct/bundle_apply.go b/bundle/direct/bundle_apply.go index 6b84f40775f..9bf0f857a5f 100644 --- a/bundle/direct/bundle_apply.go +++ b/bundle/direct/bundle_apply.go @@ -152,7 +152,6 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa return true }) - // Note: caller is responsible for closing StateDB after Apply returns. 
} func (b *DeploymentBundle) LookupReferencePostDeploy(ctx context.Context, path *structpath.PathNode) (any, error) { From c7f54e81e69757154de65f683f084c9698a8953d Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Sun, 10 May 2026 19:55:59 +0200 Subject: [PATCH 40/80] fix gofumpt and test output Co-authored-by: Denis Bilenko --- acceptance/selftest/kill_caller/offset/output.txt | 12 ++++++------ bundle/direct/bundle_apply.go | 1 - 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/acceptance/selftest/kill_caller/offset/output.txt b/acceptance/selftest/kill_caller/offset/output.txt index 03407dd0d8a..cb87595a2c2 100644 --- a/acceptance/selftest/kill_caller/offset/output.txt +++ b/acceptance/selftest/kill_caller/offset/output.txt @@ -1,15 +1,15 @@ >>> [CLI] current-user me { - "id":"123", - "userName":"test@example.com" + "id": "123", + "userName": "test@example.com" } Attempt 1 done - success (offset) >>> [CLI] current-user me { - "id":"123", - "userName":"test@example.com" + "id": "123", + "userName": "test@example.com" } Attempt 2 done - success (offset) @@ -27,7 +27,7 @@ Attempt 4 done - killed >>> [CLI] current-user me { - "id":"123", - "userName":"test@example.com" + "id": "123", + "userName": "test@example.com" } Attempt 5 done - success (past kill window) diff --git a/bundle/direct/bundle_apply.go b/bundle/direct/bundle_apply.go index 9bf0f857a5f..6bad8091469 100644 --- a/bundle/direct/bundle_apply.go +++ b/bundle/direct/bundle_apply.go @@ -151,7 +151,6 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa return true }) - } func (b *DeploymentBundle) LookupReferencePostDeploy(ctx context.Context, path *structpath.PathNode) (any, error) { From 57ad5710b2ad3d56bc5cf875e7e1ffbd3a2d2052 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Sun, 10 May 2026 19:59:02 +0200 Subject: [PATCH 41/80] shrink chain-10-jobs to chain-3-jobs 3 jobs exercise the same DAG + partial-WAL recovery path with 3x fewer output lines. 
Co-authored-by: Denis Bilenko --- .../deploy/wal/chain-10-jobs/databricks.yml | 117 ----- .../deploy/wal/chain-10-jobs/output.txt | 414 ------------------ .../deploy/wal/chain-3-jobs/databricks.yml | 40 ++ .../out.test.toml | 0 .../bundle/deploy/wal/chain-3-jobs/output.txt | 120 +++++ .../{chain-10-jobs => chain-3-jobs}/script | 0 .../{chain-10-jobs => chain-3-jobs}/test.py | 0 .../{chain-10-jobs => chain-3-jobs}/test.toml | 6 +- 8 files changed, 163 insertions(+), 534 deletions(-) delete mode 100644 acceptance/bundle/deploy/wal/chain-10-jobs/databricks.yml delete mode 100644 acceptance/bundle/deploy/wal/chain-10-jobs/output.txt create mode 100644 acceptance/bundle/deploy/wal/chain-3-jobs/databricks.yml rename acceptance/bundle/deploy/wal/{chain-10-jobs => chain-3-jobs}/out.test.toml (100%) create mode 100644 acceptance/bundle/deploy/wal/chain-3-jobs/output.txt rename acceptance/bundle/deploy/wal/{chain-10-jobs => chain-3-jobs}/script (100%) rename acceptance/bundle/deploy/wal/{chain-10-jobs => chain-3-jobs}/test.py (100%) rename acceptance/bundle/deploy/wal/{chain-10-jobs => chain-3-jobs}/test.toml (69%) diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/databricks.yml b/acceptance/bundle/deploy/wal/chain-10-jobs/databricks.yml deleted file mode 100644 index 2652cdbed62..00000000000 --- a/acceptance/bundle/deploy/wal/chain-10-jobs/databricks.yml +++ /dev/null @@ -1,117 +0,0 @@ -bundle: - name: wal-chain-test - -resources: - jobs: - # Linear chain: job_01 -> job_02 -> ... 
-> job_10 - # Execution order: job_01 first, job_10 last - job_01: - name: "job-01" - description: "first in chain" - tasks: - - task_key: "task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 - job_02: - name: "job-02" - description: "depends on ${resources.jobs.job_01.id}" - tasks: - - task_key: "task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 - job_03: - name: "job-03" - description: "depends on ${resources.jobs.job_02.id}" - tasks: - - task_key: "task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 - job_04: - name: "job-04" - description: "depends on ${resources.jobs.job_03.id}" - tasks: - - task_key: "task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 - job_05: - name: "job-05" - description: "depends on ${resources.jobs.job_04.id}" - tasks: - - task_key: "task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 - job_06: - name: "job-06" - description: "depends on ${resources.jobs.job_05.id}" - tasks: - - task_key: "task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 - job_07: - name: "job-07" - description: "depends on ${resources.jobs.job_06.id}" - tasks: - - task_key: "task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 - job_08: - name: "job-08" - description: "depends on ${resources.jobs.job_07.id}" - tasks: - - task_key: "task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - 
node_type_id: i3.xlarge - num_workers: 0 - job_09: - name: "job-09" - description: "depends on ${resources.jobs.job_08.id}" - tasks: - - task_key: "task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 - job_10: - name: "job-10" - description: "depends on ${resources.jobs.job_09.id}" - tasks: - - task_key: "task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt deleted file mode 100644 index 818bf13b251..00000000000 --- a/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt +++ /dev/null @@ -1,414 +0,0 @@ -=== First deploy (crashes on job_10) === - ->>> errcode [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files... -Deploying resources... 
-[PROCESS_KILLED] - -Exit code: [KILLED] - -=== WAL content after crash === -{ - "cli_version": "[DEV_VERSION]", - "lineage": "[UUID]", - "serial": [SERIAL], - "state_version": 2 -} -{ - "k": "resources.jobs.job_01", - "v": { - "__id__": "[ID]", - "state": { - "deployment": { - "kind": "BUNDLE", - "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" - }, - "description": "first in chain", - "edit_mode": "UI_LOCKED", - "format": "MULTI_TASK", - "max_concurrent_runs": 1, - "name": "job-01", - "queue": { - "enabled": true - }, - "tasks": [ - { - "new_cluster": { - "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, - "spark_version": "15.4.x-scala2.12" - }, - "spark_python_task": { - "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" - }, - "task_key": "task" - } - ] - } - } -} -{ - "k": "resources.jobs.job_02", - "v": { - "__id__": "[ID]", - "depends_on": [ - { - "label": "${resources.jobs.job_01.id}", - "node": "resources.jobs.job_01" - } - ], - "state": { - "deployment": { - "kind": "BUNDLE", - "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" - }, - "description": "depends on 1001", - "edit_mode": "UI_LOCKED", - "format": "MULTI_TASK", - "max_concurrent_runs": 1, - "name": "job-02", - "queue": { - "enabled": true - }, - "tasks": [ - { - "new_cluster": { - "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, - "spark_version": "15.4.x-scala2.12" - }, - "spark_python_task": { - "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" - }, - "task_key": "task" - } - ] - } - } -} -{ - "k": "resources.jobs.job_03", - "v": { - "__id__": "[ID]", - "depends_on": [ - { - "label": "${resources.jobs.job_02.id}", - "node": "resources.jobs.job_02" - } - ], - "state": { - "deployment": { - "kind": "BUNDLE", - "metadata_file_path": 
"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" - }, - "description": "depends on 1001", - "edit_mode": "UI_LOCKED", - "format": "MULTI_TASK", - "max_concurrent_runs": 1, - "name": "job-03", - "queue": { - "enabled": true - }, - "tasks": [ - { - "new_cluster": { - "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, - "spark_version": "15.4.x-scala2.12" - }, - "spark_python_task": { - "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" - }, - "task_key": "task" - } - ] - } - } -} -{ - "k": "resources.jobs.job_04", - "v": { - "__id__": "[ID]", - "depends_on": [ - { - "label": "${resources.jobs.job_03.id}", - "node": "resources.jobs.job_03" - } - ], - "state": { - "deployment": { - "kind": "BUNDLE", - "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" - }, - "description": "depends on 1001", - "edit_mode": "UI_LOCKED", - "format": "MULTI_TASK", - "max_concurrent_runs": 1, - "name": "job-04", - "queue": { - "enabled": true - }, - "tasks": [ - { - "new_cluster": { - "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, - "spark_version": "15.4.x-scala2.12" - }, - "spark_python_task": { - "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" - }, - "task_key": "task" - } - ] - } - } -} -{ - "k": "resources.jobs.job_05", - "v": { - "__id__": "[ID]", - "depends_on": [ - { - "label": "${resources.jobs.job_04.id}", - "node": "resources.jobs.job_04" - } - ], - "state": { - "deployment": { - "kind": "BUNDLE", - "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" - }, - "description": "depends on 1001", - "edit_mode": "UI_LOCKED", - "format": "MULTI_TASK", - "max_concurrent_runs": 1, - "name": "job-05", - "queue": { - "enabled": true - }, - "tasks": [ - { - "new_cluster": { - "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, - "spark_version": "15.4.x-scala2.12" - 
}, - "spark_python_task": { - "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" - }, - "task_key": "task" - } - ] - } - } -} -{ - "k": "resources.jobs.job_06", - "v": { - "__id__": "[ID]", - "depends_on": [ - { - "label": "${resources.jobs.job_05.id}", - "node": "resources.jobs.job_05" - } - ], - "state": { - "deployment": { - "kind": "BUNDLE", - "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" - }, - "description": "depends on 1001", - "edit_mode": "UI_LOCKED", - "format": "MULTI_TASK", - "max_concurrent_runs": 1, - "name": "job-06", - "queue": { - "enabled": true - }, - "tasks": [ - { - "new_cluster": { - "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, - "spark_version": "15.4.x-scala2.12" - }, - "spark_python_task": { - "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" - }, - "task_key": "task" - } - ] - } - } -} -{ - "k": "resources.jobs.job_07", - "v": { - "__id__": "[ID]", - "depends_on": [ - { - "label": "${resources.jobs.job_06.id}", - "node": "resources.jobs.job_06" - } - ], - "state": { - "deployment": { - "kind": "BUNDLE", - "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" - }, - "description": "depends on 1001", - "edit_mode": "UI_LOCKED", - "format": "MULTI_TASK", - "max_concurrent_runs": 1, - "name": "job-07", - "queue": { - "enabled": true - }, - "tasks": [ - { - "new_cluster": { - "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, - "spark_version": "15.4.x-scala2.12" - }, - "spark_python_task": { - "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" - }, - "task_key": "task" - } - ] - } - } -} -{ - "k": "resources.jobs.job_08", - "v": { - "__id__": "[ID]", - "depends_on": [ - { - "label": "${resources.jobs.job_07.id}", - "node": "resources.jobs.job_07" - } - ], - "state": { - "deployment": { - "kind": "BUNDLE", - 
"metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" - }, - "description": "depends on 1001", - "edit_mode": "UI_LOCKED", - "format": "MULTI_TASK", - "max_concurrent_runs": 1, - "name": "job-08", - "queue": { - "enabled": true - }, - "tasks": [ - { - "new_cluster": { - "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, - "spark_version": "15.4.x-scala2.12" - }, - "spark_python_task": { - "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" - }, - "task_key": "task" - } - ] - } - } -} -{ - "k": "resources.jobs.job_09", - "v": { - "__id__": "[ID]", - "depends_on": [ - { - "label": "${resources.jobs.job_08.id}", - "node": "resources.jobs.job_08" - } - ], - "state": { - "deployment": { - "kind": "BUNDLE", - "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" - }, - "description": "depends on 1001", - "edit_mode": "UI_LOCKED", - "format": "MULTI_TASK", - "max_concurrent_runs": 1, - "name": "job-09", - "queue": { - "enabled": true - }, - "tasks": [ - { - "new_cluster": { - "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, - "spark_version": "15.4.x-scala2.12" - }, - "spark_python_task": { - "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" - }, - "task_key": "task" - } - ] - } - } -} - -=== Number of jobs saved in WAL === -9 - -=== Bundle summary (reads from WAL) === -Name: wal-chain-test -Target: default -Workspace: - User: [USERNAME] - Path: /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default -Resources: - Jobs: - job_01: - Name: job-01 - URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] - job_02: - Name: job-02 - URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] - job_03: - Name: job-03 - URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] - job_04: - Name: job-04 - URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] - job_05: - Name: job-05 - URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] - job_06: - Name: job-06 - 
URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] - job_07: - Name: job-07 - URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] - job_08: - Name: job-08 - URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] - job_09: - Name: job-09 - URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] - job_10: - Name: job-10 - URL: (not deployed) - -=== Second deploy (recovery) === - ->>> [CLI] bundle deploy --force-lock -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! - -=== WAL after successful deploy === -WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/databricks.yml b/acceptance/bundle/deploy/wal/chain-3-jobs/databricks.yml new file mode 100644 index 00000000000..fc3a46205bc --- /dev/null +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/databricks.yml @@ -0,0 +1,40 @@ +bundle: + name: wal-chain-test + +resources: + jobs: + # Linear chain: job_01 -> job_02 -> job_03 + # Execution order: job_01 first, job_03 last + job_01: + name: "job-01" + description: "first in chain" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_02: + name: "job-02" + description: "depends on ${resources.jobs.job_01.id}" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_03: + name: "job-03" + description: "depends on ${resources.jobs.job_02.id}" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml b/acceptance/bundle/deploy/wal/chain-3-jobs/out.test.toml similarity index 100% rename from acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml rename to 
acceptance/bundle/deploy/wal/chain-3-jobs/out.test.toml diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt new file mode 100644 index 00000000000..ef56c8e0981 --- /dev/null +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt @@ -0,0 +1,120 @@ +=== First deploy (crashes on job_10) === + +>>> errcode [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files... +Deploying resources... +[PROCESS_KILLED] + +Exit code: [KILLED] + +=== WAL content after crash === +{ + "cli_version": "[DEV_VERSION]", + "lineage": "[UUID]", + "serial": [SERIAL], + "state_version": 2 +} +{ + "k": "resources.jobs.job_01", + "v": { + "__id__": "[ID]", + "state": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" + }, + "description": "first in chain", + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job-01", + "queue": { + "enabled": true + }, + "tasks": [ + { + "new_cluster": { + "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, + "spark_version": "15.4.x-scala2.12" + }, + "spark_python_task": { + "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" + }, + "task_key": "task" + } + ] + } + } +} +{ + "k": "resources.jobs.job_02", + "v": { + "__id__": "[ID]", + "depends_on": [ + { + "label": "${resources.jobs.job_01.id}", + "node": "resources.jobs.job_01" + } + ], + "state": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" + }, + "description": "depends on 1001", + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job-02", + "queue": { + "enabled": true + }, + "tasks": [ + { + "new_cluster": { + "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, + 
"spark_version": "15.4.x-scala2.12" + }, + "spark_python_task": { + "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" + }, + "task_key": "task" + } + ] + } + } +} + +=== Number of jobs saved in WAL === +2 + +=== Bundle summary (reads from WAL) === +Name: wal-chain-test +Target: default +Workspace: + User: [USERNAME] + Path: /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default +Resources: + Jobs: + job_01: + Name: job-01 + URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + job_02: + Name: job-02 + URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + job_03: + Name: job-03 + URL: (not deployed) + +=== Second deploy (recovery) === + +>>> [CLI] bundle deploy --force-lock +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! + +=== WAL after successful deploy === +WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/script b/acceptance/bundle/deploy/wal/chain-3-jobs/script similarity index 100% rename from acceptance/bundle/deploy/wal/chain-10-jobs/script rename to acceptance/bundle/deploy/wal/chain-3-jobs/script diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/test.py b/acceptance/bundle/deploy/wal/chain-3-jobs/test.py similarity index 100% rename from acceptance/bundle/deploy/wal/chain-10-jobs/test.py rename to acceptance/bundle/deploy/wal/chain-3-jobs/test.py diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/test.toml b/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml similarity index 69% rename from acceptance/bundle/deploy/wal/chain-10-jobs/test.toml rename to acceptance/bundle/deploy/wal/chain-3-jobs/test.toml index 36076f3df5e..2425c89deae 100644 --- a/acceptance/bundle/deploy/wal/chain-10-jobs/test.toml +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml @@ -1,9 +1,9 @@ -# Linear chain: job_01 -> job_02 -> ... 
-> job_10 -# Let first 9 jobs/create succeed, then kill on the 10th +# Linear chain: job_01 -> job_02 -> job_03 +# Let first 2 jobs/create succeed, then kill on the 3rd [[Server]] Pattern = "POST /api/2.2/jobs/create" -KillCallerOffset = 9 +KillCallerOffset = 2 KillCaller = 1 Response.Body = '{"job_id": 1001}' From 3bd8efefa2bfdd5a6b6d672de3ac25e4cce9cb4a Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Sun, 10 May 2026 20:00:26 +0200 Subject: [PATCH 42/80] fix test names in state_test.go: Close -> Finalize, restore SaveFinalize Co-authored-by: Denis Bilenko --- bundle/direct/dstate/state_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bundle/direct/dstate/state_test.go b/bundle/direct/dstate/state_test.go index b493258b2f2..3f0f614cd3f 100644 --- a/bundle/direct/dstate/state_test.go +++ b/bundle/direct/dstate/state_test.go @@ -9,7 +9,7 @@ import ( "github.com/stretchr/testify/require" ) -func TestOpenCloseRoundTrip(t *testing.T) { +func TestOpenSaveFinalizeRoundTrip(t *testing.T) { path := filepath.Join(t.TempDir(), "state.json") var db DeploymentState @@ -26,7 +26,7 @@ func TestOpenCloseRoundTrip(t *testing.T) { require.NoError(t, db2.Finalize(t.Context())) } -func TestCloseWithNoEntriesDoesNotWriteStateFile(t *testing.T) { +func TestFinalizeWithNoEntriesDoesNotWriteStateFile(t *testing.T) { path := filepath.Join(t.TempDir(), "state.json") var db DeploymentState From 870d434974d4fb91755272152c214f03445c3e9c Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Sun, 10 May 2026 20:03:29 +0200 Subject: [PATCH 43/80] clean up WAL acceptance tests - drop corrupted-wal-middle (same code path as corrupted-wal-entry) - drop multiple-crashes (covered by crash-after-create) - drop summary-after-crash (incomplete output; crash coverage in crash-after-create) - fix empty-wal echo: (unexpected) -> (expected) - fix parent test.toml: exit code 137 -> [KILLED] only; errors show Exit code: 1 Co-authored-by: Denis Bilenko --- 
.../wal/corrupted-wal-middle/databricks.yml | 25 ----------- .../wal/corrupted-wal-middle/out.test.toml | 3 -- .../wal/corrupted-wal-middle/output.txt | 27 ------------ .../deploy/wal/corrupted-wal-middle/script | 43 ------------------- .../deploy/wal/corrupted-wal-middle/test.py | 1 - .../deploy/wal/corrupted-wal-middle/test.toml | 10 ----- .../bundle/deploy/wal/empty-wal/output.txt | 2 +- acceptance/bundle/deploy/wal/empty-wal/script | 2 +- .../deploy/wal/future-serial-wal/output.txt | 2 +- .../deploy/wal/lineage-mismatch/output.txt | 2 +- .../wal/multiple-crashes/databricks.yml | 27 ------------ .../deploy/wal/multiple-crashes/out.test.toml | 3 -- .../deploy/wal/multiple-crashes/output.txt | 39 ----------------- .../bundle/deploy/wal/multiple-crashes/script | 29 ------------- .../deploy/wal/multiple-crashes/test.py | 1 - .../deploy/wal/multiple-crashes/test.toml | 18 -------- .../wal/summary-after-crash/databricks.yml | 27 ------------ .../wal/summary-after-crash/out.test.toml | 3 -- .../deploy/wal/summary-after-crash/output.txt | 20 --------- .../deploy/wal/summary-after-crash/script | 19 -------- .../deploy/wal/summary-after-crash/test.py | 1 - .../deploy/wal/summary-after-crash/test.toml | 13 ------ acceptance/bundle/deploy/wal/test.toml | 2 +- 23 files changed, 5 insertions(+), 314 deletions(-) delete mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-middle/databricks.yml delete mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-middle/out.test.toml delete mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt delete mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-middle/script delete mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-middle/test.py delete mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml delete mode 100644 acceptance/bundle/deploy/wal/multiple-crashes/databricks.yml delete mode 100644 acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml delete mode 100644 
acceptance/bundle/deploy/wal/multiple-crashes/output.txt delete mode 100644 acceptance/bundle/deploy/wal/multiple-crashes/script delete mode 100644 acceptance/bundle/deploy/wal/multiple-crashes/test.py delete mode 100644 acceptance/bundle/deploy/wal/multiple-crashes/test.toml delete mode 100644 acceptance/bundle/deploy/wal/summary-after-crash/databricks.yml delete mode 100644 acceptance/bundle/deploy/wal/summary-after-crash/out.test.toml delete mode 100644 acceptance/bundle/deploy/wal/summary-after-crash/output.txt delete mode 100644 acceptance/bundle/deploy/wal/summary-after-crash/script delete mode 100644 acceptance/bundle/deploy/wal/summary-after-crash/test.py delete mode 100644 acceptance/bundle/deploy/wal/summary-after-crash/test.toml diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/databricks.yml b/acceptance/bundle/deploy/wal/corrupted-wal-middle/databricks.yml deleted file mode 100644 index aef2c714ec7..00000000000 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/databricks.yml +++ /dev/null @@ -1,25 +0,0 @@ -bundle: - name: wal-corrupted-middle-test - -resources: - jobs: - job_one: - name: "job-one" - tasks: - - task_key: "task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 - job_two: - name: "job-two" - tasks: - - task_key: "task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/out.test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-middle/out.test.toml deleted file mode 100644 index e90b6d5d1ba..00000000000 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/out.test.toml +++ /dev/null @@ -1,3 +0,0 @@ -Local = true -Cloud = false -EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt 
b/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt deleted file mode 100644 index ff13944ae4e..00000000000 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt +++ /dev/null @@ -1,27 +0,0 @@ -=== Creating state file with serial 5 === -=== Creating WAL with corrupted MIDDLE entry === -=== WAL content === -{"lineage":"test-lineage-456","serial": [SERIAL]} -{"k":"resources.jobs.job_one","v":{"__id__": "[ID]","state":{"name":"job-one"}}} -{"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial- -{"k":"resources.jobs.job_two","v":{"__id__": "[ID]","state":{"name":"job-two"}}} -=== Deploy (should recover valid entries and skip corrupted line) === - ->>> [CLI] bundle deploy -Warn: Skipping corrupted WAL entry at [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal:3: unexpected end of JSON input -Warn: Saved 1 corrupted WAL entries to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-middle-test/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! 
-=== Final state (should have recovered entries) === -{ - "serial": [SERIAL], - "state_keys": [ - "resources.jobs.job_one", - "resources.jobs.job_two" - ] -} -=== Corrupted WAL entries file === -{"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial-=== WAL after deploy === -WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/script b/acceptance/bundle/deploy/wal/corrupted-wal-middle/script deleted file mode 100644 index 6307d7fbf73..00000000000 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/script +++ /dev/null @@ -1,43 +0,0 @@ -echo "=== Creating state file with serial 5 ===" -mkdir -p .databricks/bundle/default -cat > .databricks/bundle/default/resources.json << 'EOF' -{ - "state_version": 1, - "cli_version": "0.0.0", - "lineage": "test-lineage-456", - "serial": 5, - "state": {} -} -EOF - -echo "=== Creating WAL with corrupted MIDDLE entry ===" -# Corrupted middle line is expected (truncated JSON from crash) and should be skipped. 
-cat > .databricks/bundle/default/resources.json.wal << 'EOF' -{"lineage":"test-lineage-456","serial":6} -{"k":"resources.jobs.job_one","v":{"__id__":"1111","state":{"name":"job-one"}}} -{"k":"resources.jobs.partial_write","v":{"__id__":"3333","state":{"name":"partial- -{"k":"resources.jobs.job_two","v":{"__id__":"2222","state":{"name":"job-two"}}} -EOF - -echo "=== WAL content ===" -cat .databricks/bundle/default/resources.json.wal - -echo "=== Deploy (should recover valid entries and skip corrupted line) ===" -trace $CLI bundle deploy 2>&1 - -echo "=== Final state (should have recovered entries) ===" -cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys | sort)}' - -echo "=== Corrupted WAL entries file ===" -if [ -f ".databricks/bundle/default/resources.json.wal.corrupted" ]; then - cat .databricks/bundle/default/resources.json.wal.corrupted -else - echo "Missing corrupted WAL entries file (unexpected)" -fi - -echo "=== WAL after deploy ===" -if [ -f ".databricks/bundle/default/resources.json.wal" ]; then - echo "WAL exists (unexpected)" -else - echo "WAL deleted (expected)" -fi diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.py b/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.py deleted file mode 100644 index 1ff8e07c707..00000000000 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.py +++ /dev/null @@ -1 +0,0 @@ -print("test") diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml deleted file mode 100644 index d5f0b1bbb65..00000000000 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml +++ /dev/null @@ -1,10 +0,0 @@ -# WAL with corrupted MIDDLE entry - valid entries are recovered and corrupted entries are skipped. - -# Since valid entries are recovered, jobs will be updated (not created fresh). 
-[[Server]] -Pattern = "POST /api/2.2/jobs/create" -Response.Body = '{"job_id": 9999}' - -[[Server]] -Pattern = "GET /api/2.2/jobs/get?job_id=9999" -Response.Body = '{"job_id": 9999, "settings": {"name": "fresh-job"}}' diff --git a/acceptance/bundle/deploy/wal/empty-wal/output.txt b/acceptance/bundle/deploy/wal/empty-wal/output.txt index 26117a2d368..884f5027445 100644 --- a/acceptance/bundle/deploy/wal/empty-wal/output.txt +++ b/acceptance/bundle/deploy/wal/empty-wal/output.txt @@ -12,7 +12,7 @@ Deployment complete! === Checking WAL file after deploy === Empty WAL deleted (expected) === Corrupted WAL file === -Corrupted WAL file missing (unexpected) +Corrupted WAL file missing (expected) === State file content === { "lineage": "[UUID]", diff --git a/acceptance/bundle/deploy/wal/empty-wal/script b/acceptance/bundle/deploy/wal/empty-wal/script index 2c66d213aab..3929de8eb1f 100644 --- a/acceptance/bundle/deploy/wal/empty-wal/script +++ b/acceptance/bundle/deploy/wal/empty-wal/script @@ -21,7 +21,7 @@ echo "=== Corrupted WAL file ===" if [ -f ".databricks/bundle/default/resources.json.wal.corrupted" ]; then ls -la .databricks/bundle/default/resources.json.wal.corrupted else - echo "Corrupted WAL file missing (unexpected)" + echo "Corrupted WAL file missing (expected)" fi echo "=== State file content ===" diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt index cb3526e9b6c..2b93423e1b9 100644 --- a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt +++ b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt @@ -9,4 +9,4 @@ Error: reading state from [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: WAL recovery failed: WAL serial (5) is ahead of expected (3), state may be corrupted -Exit code: [KILLED] +Exit code: 1 diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt index 
e706e1d0870..a539a2fb0c3 100644 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt @@ -9,4 +9,4 @@ Error: reading state from [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: WAL recovery failed: WAL lineage (wal-lineage-bbb) does not match state lineage (state-lineage-aaa) -Exit code: [KILLED] +Exit code: 1 diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/databricks.yml b/acceptance/bundle/deploy/wal/multiple-crashes/databricks.yml deleted file mode 100644 index 3dc96ed8560..00000000000 --- a/acceptance/bundle/deploy/wal/multiple-crashes/databricks.yml +++ /dev/null @@ -1,27 +0,0 @@ -bundle: - name: wal-multi-crash-test - -resources: - jobs: - job_a: - name: "test-job-a" - description: "first job" - tasks: - - task_key: "task-a" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 - job_b: - name: "test-job-b" - description: "depends on ${resources.jobs.job_a.id}" - tasks: - - task_key: "task-b" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml b/acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml deleted file mode 100644 index e90b6d5d1ba..00000000000 --- a/acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml +++ /dev/null @@ -1,3 +0,0 @@ -Local = true -Cloud = false -EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/output.txt b/acceptance/bundle/deploy/wal/multiple-crashes/output.txt deleted file mode 100644 index 8553dda7b3c..00000000000 --- a/acceptance/bundle/deploy/wal/multiple-crashes/output.txt +++ /dev/null @@ -1,39 +0,0 @@ -=== First deploy (crashes after job_a create) === - ->>> errcode [CLI] bundle deploy -Uploading bundle files to 
/Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files... -Deploying resources... -[PROCESS_KILLED] - -Exit code: [KILLED] -=== WAL after first crash === -WAL exists -{"lineage":"[UUID]","serial": [SERIAL],"state_version":2,"cli_version":"[DEV_VERSION]"} -{"k":"resources.jobs.job_a","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} -=== Second deploy (crashes during job_a update) === - ->>> errcode [CLI] bundle deploy --force-lock -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files... -Deploying resources... -[PROCESS_KILLED] - -Exit code: [KILLED] -=== WAL after second crash === -WAL still exists -=== Third deploy (should succeed) === - ->>> [CLI] bundle deploy --force-lock -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! 
-=== Final state === -{ - "serial": [SERIAL], - "state_keys": [ - "resources.jobs.job_a", - "resources.jobs.job_b" - ] -} -=== WAL after successful deploy === -WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/script b/acceptance/bundle/deploy/wal/multiple-crashes/script deleted file mode 100644 index 0adcd2a980e..00000000000 --- a/acceptance/bundle/deploy/wal/multiple-crashes/script +++ /dev/null @@ -1,29 +0,0 @@ -echo "=== First deploy (crashes after job_a create) ===" -trace errcode $CLI bundle deploy - -echo "=== WAL after first crash ===" -if [ -f ".databricks/bundle/default/resources.json.wal" ]; then - echo "WAL exists" - cat .databricks/bundle/default/resources.json.wal -fi - -echo "=== Second deploy (crashes during job_a update) ===" -trace errcode $CLI bundle deploy --force-lock - -echo "=== WAL after second crash ===" -if [ -f ".databricks/bundle/default/resources.json.wal" ]; then - echo "WAL still exists" -fi - -echo "=== Third deploy (should succeed) ===" -trace $CLI bundle deploy --force-lock - -echo "=== Final state ===" -cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys)}' - -echo "=== WAL after successful deploy ===" -if [ -f ".databricks/bundle/default/resources.json.wal" ]; then - echo "WAL exists (unexpected)" -else - echo "WAL deleted (expected)" -fi diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/test.py b/acceptance/bundle/deploy/wal/multiple-crashes/test.py deleted file mode 100644 index 1ff8e07c707..00000000000 --- a/acceptance/bundle/deploy/wal/multiple-crashes/test.py +++ /dev/null @@ -1 +0,0 @@ -print("test") diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/test.toml b/acceptance/bundle/deploy/wal/multiple-crashes/test.toml deleted file mode 100644 index 474177b8046..00000000000 --- a/acceptance/bundle/deploy/wal/multiple-crashes/test.toml +++ /dev/null @@ -1,18 +0,0 @@ -# Multiple real crashes during deployment - WAL should persist 
until successful finalize. -# First deploy: crashes after job_a create (kill on jobs/get) -# Second deploy: crashes during job_a update (kill on jobs/reset) -# Third deploy: succeeds (both counters exhausted) - -[[Server]] -Pattern = "POST /api/2.2/jobs/create" -Response.Body = '{"job_id": 1001}' - -[[Server]] -Pattern = "POST /api/2.2/jobs/reset" -KillCaller = 1 -Response.Body = '{}' - -[[Server]] -Pattern = "GET /api/2.2/jobs/get" -KillCaller = 1 -Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/databricks.yml b/acceptance/bundle/deploy/wal/summary-after-crash/databricks.yml deleted file mode 100644 index 86376fd7baf..00000000000 --- a/acceptance/bundle/deploy/wal/summary-after-crash/databricks.yml +++ /dev/null @@ -1,27 +0,0 @@ -bundle: - name: wal-summary-test - -resources: - jobs: - job_a: - name: "job-a" - description: "first job" - tasks: - - task_key: "task-a" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 - job_b: - name: "job-b" - description: "depends on ${resources.jobs.job_a.id}" - tasks: - - task_key: "task-b" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/out.test.toml b/acceptance/bundle/deploy/wal/summary-after-crash/out.test.toml deleted file mode 100644 index e90b6d5d1ba..00000000000 --- a/acceptance/bundle/deploy/wal/summary-after-crash/out.test.toml +++ /dev/null @@ -1,3 +0,0 @@ -Local = true -Cloud = false -EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/output.txt b/acceptance/bundle/deploy/wal/summary-after-crash/output.txt deleted file mode 100644 index 634f804e17c..00000000000 --- a/acceptance/bundle/deploy/wal/summary-after-crash/output.txt +++ /dev/null @@ 
-1,20 +0,0 @@ -=== Deploy (job_a created and saved, then crash on jobs/get) === - ->>> errcode [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-summary-test/default/files... -Deploying resources... -[PROCESS_KILLED] - -Exit code: [KILLED] -=== State directory contents after crash === -deployment.json -resources.json.wal -sync-snapshots -=== WAL should exist after crash === -WAL exists (expected) -{"lineage":"[UUID]","serial": [SERIAL],"state_version":2,"cli_version":"[DEV_VERSION]"} -{"k":"resources.jobs.job_a","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-summary-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-summary-test/default/files/test.py"},"task_key":"task-a"}]}}} -=== State file after crash === -cat: .databricks/bundle/default/resources.json: No such file or directory - -Exit code: [KILLED] diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/script b/acceptance/bundle/deploy/wal/summary-after-crash/script deleted file mode 100644 index 3b007062c60..00000000000 --- a/acceptance/bundle/deploy/wal/summary-after-crash/script +++ /dev/null @@ -1,19 +0,0 @@ -echo "=== Deploy (job_a created and saved, then crash on jobs/get) ===" -trace errcode $CLI bundle deploy - -echo "=== State directory contents after crash ===" -ls .databricks/bundle/default/ - -echo "=== WAL should exist after crash ===" -if [ -f ".databricks/bundle/default/resources.json.wal" ]; then - echo "WAL exists (expected)" - cat .databricks/bundle/default/resources.json.wal -else - echo "WAL missing (unexpected)" -fi - -echo "=== State file after crash ===" -cat 
.databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys)}' - -echo "=== Bundle summary (should show job_a from WAL) ===" -trace $CLI bundle summary -o json | jq '{job_a_id: .resources.jobs.job_a.id, job_b_id: .resources.jobs.job_b.id}' diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/test.py b/acceptance/bundle/deploy/wal/summary-after-crash/test.py deleted file mode 100644 index 1ff8e07c707..00000000000 --- a/acceptance/bundle/deploy/wal/summary-after-crash/test.py +++ /dev/null @@ -1 +0,0 @@ -print("test") diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/test.toml b/acceptance/bundle/deploy/wal/summary-after-crash/test.toml deleted file mode 100644 index f14cbbfcbc3..00000000000 --- a/acceptance/bundle/deploy/wal/summary-after-crash/test.toml +++ /dev/null @@ -1,13 +0,0 @@ -# Bundle summary should show resources recovered from WAL after a real crash. -# job_b depends on job_a, so after job_a is created and SaveState is called, -# refreshRemoteState calls jobs/get to fetch job_a's state for job_b's reference. -# We kill on jobs/get - AFTER job_a's SaveState, so WAL contains job_a. - -[[Server]] -Pattern = "POST /api/2.2/jobs/create" -Response.Body = '{"job_id": 1001}' - -[[Server]] -Pattern = "GET /api/2.2/jobs/get" -KillCaller = 1 -Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' diff --git a/acceptance/bundle/deploy/wal/test.toml b/acceptance/bundle/deploy/wal/test.toml index df700645f7a..50b50dbcbaf 100644 --- a/acceptance/bundle/deploy/wal/test.toml +++ b/acceptance/bundle/deploy/wal/test.toml @@ -19,7 +19,7 @@ New = """${1}[PROCESS_KILLED] Exit code:""" [[Repls]] -Old = 'Exit code: (137|1)' +Old = 'Exit code: 137' New = 'Exit code: [KILLED]' # On Windows, no bash "Killed" message appears when CLI has produced output before termination. 
From 174803603e6b245e557247a0b24e4b4455745409 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Sun, 10 May 2026 20:58:02 +0200 Subject: [PATCH 44/80] fix crash-after-create: handle Linux exit code 1 after KillCaller On Linux, KillCaller (SIGKILL) may produce exit code 1 instead of 137. Add a context-sensitive replacement to normalise exit code 1 only when it directly follows [PROCESS_KILLED], so genuine error exits (exit code 1 from cat/jq) remain visible as Exit code: 1 in the output. Co-authored-by: Denis Bilenko --- acceptance/bundle/deploy/wal/crash-after-create/output.txt | 2 +- acceptance/bundle/deploy/wal/test.toml | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt index cf9230983c7..b3250b2db1a 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/output.txt +++ b/acceptance/bundle/deploy/wal/crash-after-create/output.txt @@ -13,4 +13,4 @@ WAL exists (expected) === State file after crash (should be empty) === cat: .databricks/bundle/default/resources.json: No such file or directory -Exit code: [KILLED] +Exit code: 1 diff --git a/acceptance/bundle/deploy/wal/test.toml b/acceptance/bundle/deploy/wal/test.toml index 50b50dbcbaf..266d748049c 100644 --- a/acceptance/bundle/deploy/wal/test.toml +++ b/acceptance/bundle/deploy/wal/test.toml @@ -22,6 +22,13 @@ Exit code:""" Old = 'Exit code: 137' New = 'Exit code: [KILLED]' +# On Linux, a KillCaller kill may surface as exit code 1 rather than 137. +# Only normalise exit code 1 when it directly follows [PROCESS_KILLED] to +# avoid masking genuine error exits (lineage-mismatch, future-serial-wal). +[[Repls]] +Old = '(\[PROCESS_KILLED\]\n\nExit code: )1' +New = '${1}[KILLED]' + # On Windows, no bash "Killed" message appears when CLI has produced output before termination. # Insert [PROCESS_KILLED] between last output line and exit code for consistency. 
[[Repls]] From 01a3610c68b571b841f3bf9484bfd7e486cfc8b5 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Sun, 10 May 2026 21:39:43 +0200 Subject: [PATCH 45/80] update selftest --- acceptance/selftest/kill_caller/offset/output.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/acceptance/selftest/kill_caller/offset/output.txt b/acceptance/selftest/kill_caller/offset/output.txt index cb87595a2c2..03407dd0d8a 100644 --- a/acceptance/selftest/kill_caller/offset/output.txt +++ b/acceptance/selftest/kill_caller/offset/output.txt @@ -1,15 +1,15 @@ >>> [CLI] current-user me { - "id": "123", - "userName": "test@example.com" + "id":"123", + "userName":"test@example.com" } Attempt 1 done - success (offset) >>> [CLI] current-user me { - "id": "123", - "userName": "test@example.com" + "id":"123", + "userName":"test@example.com" } Attempt 2 done - success (offset) @@ -27,7 +27,7 @@ Attempt 4 done - killed >>> [CLI] current-user me { - "id": "123", - "userName": "test@example.com" + "id":"123", + "userName":"test@example.com" } Attempt 5 done - success (past kill window) From 69dfc3ea2226616324b9ed0498848992dcc053b5 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 10:41:27 +0200 Subject: [PATCH 46/80] fix WAL acceptance test hygiene - chain-3-jobs: fix stale echo "job_10" -> "job_03" - corrupted-wal-entry, future-serial-wal, lineage-mismatch, stale-wal, wal-with-delete: commit static fixture files (resources.json, resources.json.wal) instead of creating them inline in script; wal-with-delete: commit databricks.yml as resources: {} instead of overwriting it at runtime Co-authored-by: Denis Bilenko --- .../bundle/deploy/wal/chain-3-jobs/output.txt | 2 +- .../bundle/deploy/wal/chain-3-jobs/script | 2 +- .../deploy/wal/corrupted-wal-entry/output.txt | 2 -- .../wal/corrupted-wal-entry/resources.json | 7 ++++ .../corrupted-wal-entry/resources.json.wal | 4 +++ .../deploy/wal/corrupted-wal-entry/script | 22 ++----------- 
.../deploy/wal/future-serial-wal/output.txt | 2 -- .../wal/future-serial-wal/resources.json | 12 +++++++ .../wal/future-serial-wal/resources.json.wal | 2 ++ .../deploy/wal/future-serial-wal/script | 23 ++----------- .../deploy/wal/lineage-mismatch/output.txt | 2 -- .../wal/lineage-mismatch/resources.json | 12 +++++++ .../wal/lineage-mismatch/resources.json.wal | 2 ++ .../bundle/deploy/wal/lineage-mismatch/script | 23 ++----------- .../bundle/deploy/wal/stale-wal/output.txt | 3 -- .../deploy/wal/stale-wal/resources.json | 12 +++++++ .../deploy/wal/stale-wal/resources.json.wal | 2 ++ acceptance/bundle/deploy/wal/stale-wal/script | 25 ++------------ .../deploy/wal/wal-with-delete/databricks.yml | 13 +------- .../deploy/wal/wal-with-delete/output.txt | 4 --- .../deploy/wal/wal-with-delete/resources.json | 12 +++++++ .../wal/wal-with-delete/resources.json.wal | 2 ++ .../bundle/deploy/wal/wal-with-delete/script | 33 ++----------------- 23 files changed, 80 insertions(+), 143 deletions(-) create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal create mode 100644 acceptance/bundle/deploy/wal/future-serial-wal/resources.json create mode 100644 acceptance/bundle/deploy/wal/future-serial-wal/resources.json.wal create mode 100644 acceptance/bundle/deploy/wal/lineage-mismatch/resources.json create mode 100644 acceptance/bundle/deploy/wal/lineage-mismatch/resources.json.wal create mode 100644 acceptance/bundle/deploy/wal/stale-wal/resources.json create mode 100644 acceptance/bundle/deploy/wal/stale-wal/resources.json.wal create mode 100644 acceptance/bundle/deploy/wal/wal-with-delete/resources.json create mode 100644 acceptance/bundle/deploy/wal/wal-with-delete/resources.json.wal diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt index ef56c8e0981..1f4b53f7cf8 100644 --- 
a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt @@ -1,4 +1,4 @@ -=== First deploy (crashes on job_10) === +=== First deploy (crashes on job_03) === >>> errcode [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files... diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/script b/acceptance/bundle/deploy/wal/chain-3-jobs/script index 1f829232ad9..6c9993c2802 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/script +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/script @@ -1,4 +1,4 @@ -echo "=== First deploy (crashes on job_10) ===" +echo "=== First deploy (crashes on job_03) ===" trace errcode $CLI bundle deploy echo "" diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt index aad802f749b..bd886c153fe 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt @@ -1,5 +1,3 @@ -=== Creating state file with serial 5 === -=== Creating WAL with corrupted LAST entry === === WAL content === {"lineage":"test-lineage-123","serial": [SERIAL]} {"k":"resources.jobs.valid_job","v":{"__id__": "[ID]","state":{"name":"valid-job"}}} diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json b/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json new file mode 100644 index 00000000000..f9f4e54d1ed --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json @@ -0,0 +1,7 @@ +{ + "state_version": 1, + "cli_version": "0.0.0", + "lineage": "test-lineage-123", + "serial": 5, + "state": {} +} diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal b/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal new file mode 100644 index 00000000000..4791ba12814 --- /dev/null +++ 
b/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal @@ -0,0 +1,4 @@ +{"lineage":"test-lineage-123","serial":6} +{"k":"resources.jobs.valid_job","v":{"__id__":"1111","state":{"name":"valid-job"}}} +{"k":"resources.jobs.another_valid","v":{"__id__":"2222","state":{"name":"another-valid"}}} +{"k":"resources.jobs.partial_write","v":{"__id__":"3333","state":{"name":"partial- diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script index dde17995da6..191a62f01fc 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script @@ -1,24 +1,6 @@ -echo "=== Creating state file with serial 5 ===" mkdir -p .databricks/bundle/default -cat > .databricks/bundle/default/resources.json << 'EOF' -{ - "state_version": 1, - "cli_version": "0.0.0", - "lineage": "test-lineage-123", - "serial": 5, - "state": {} -} -EOF - -echo "=== Creating WAL with corrupted LAST entry ===" -# Corrupted last line is expected (truncated JSON from crash) and should be skipped. -# Valid entries before it should be recovered. 
-cat > .databricks/bundle/default/resources.json.wal << 'EOF' -{"lineage":"test-lineage-123","serial":6} -{"k":"resources.jobs.valid_job","v":{"__id__":"1111","state":{"name":"valid-job"}}} -{"k":"resources.jobs.another_valid","v":{"__id__":"2222","state":{"name":"another-valid"}}} -{"k":"resources.jobs.partial_write","v":{"__id__":"3333","state":{"name":"partial- -EOF +cp resources.json .databricks/bundle/default/ +cp resources.json.wal .databricks/bundle/default/ echo "=== WAL content ===" cat .databricks/bundle/default/resources.json.wal diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt index 2b93423e1b9..8fc16565fe3 100644 --- a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt +++ b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt @@ -1,5 +1,3 @@ -=== Creating state file (serial=2) === -=== Creating WAL with future serial (serial=5, expected=3) === === WAL content === {"lineage":"test-lineage-123","serial": [SERIAL]} {"k":"resources.jobs.test_job","v":{"__id__": "[ID]","state":{"name":"test-job"}}} diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/resources.json b/acceptance/bundle/deploy/wal/future-serial-wal/resources.json new file mode 100644 index 00000000000..f2f06b34bf4 --- /dev/null +++ b/acceptance/bundle/deploy/wal/future-serial-wal/resources.json @@ -0,0 +1,12 @@ +{ + "state_version": 1, + "cli_version": "0.0.0", + "lineage": "test-lineage-123", + "serial": 2, + "state": { + "resources.jobs.test_job": { + "__id__": "1001", + "state": {"name": "test-job"} + } + } +} diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/resources.json.wal b/acceptance/bundle/deploy/wal/future-serial-wal/resources.json.wal new file mode 100644 index 00000000000..98a8e48802b --- /dev/null +++ b/acceptance/bundle/deploy/wal/future-serial-wal/resources.json.wal @@ -0,0 +1,2 @@ +{"lineage":"test-lineage-123","serial":5} 
+{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/script b/acceptance/bundle/deploy/wal/future-serial-wal/script index 7b1784b0c69..f7a57192255 100644 --- a/acceptance/bundle/deploy/wal/future-serial-wal/script +++ b/acceptance/bundle/deploy/wal/future-serial-wal/script @@ -1,25 +1,6 @@ -echo "=== Creating state file (serial=2) ===" mkdir -p .databricks/bundle/default -cat > .databricks/bundle/default/resources.json << 'EOF' -{ - "state_version": 1, - "cli_version": "0.0.0", - "lineage": "test-lineage-123", - "serial": 2, - "state": { - "resources.jobs.test_job": { - "__id__": "1001", - "state": {"name": "test-job"} - } - } -} -EOF - -echo "=== Creating WAL with future serial (serial=5, expected=3) ===" -cat > .databricks/bundle/default/resources.json.wal << 'EOF' -{"lineage":"test-lineage-123","serial":5} -{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} -EOF +cp resources.json .databricks/bundle/default/ +cp resources.json.wal .databricks/bundle/default/ echo "=== WAL content ===" cat .databricks/bundle/default/resources.json.wal diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt index a539a2fb0c3..f090a161637 100644 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt @@ -1,5 +1,3 @@ -=== Creating state file with lineage-A === -=== Creating WAL with lineage-B (mismatch) === === WAL content === {"lineage":"wal-lineage-bbb","serial": [SERIAL]} {"k":"resources.jobs.test_job","v":{"__id__": "[ID]","state":{"name":"test-job"}}} diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/resources.json b/acceptance/bundle/deploy/wal/lineage-mismatch/resources.json new file mode 100644 index 00000000000..444a9ea888d --- /dev/null +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/resources.json @@ 
-0,0 +1,12 @@ +{ + "state_version": 1, + "cli_version": "0.0.0", + "lineage": "state-lineage-aaa", + "serial": 1, + "state": { + "resources.jobs.test_job": { + "__id__": "1001", + "state": {"name": "test-job"} + } + } +} diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/resources.json.wal b/acceptance/bundle/deploy/wal/lineage-mismatch/resources.json.wal new file mode 100644 index 00000000000..d14fb4a9713 --- /dev/null +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/resources.json.wal @@ -0,0 +1,2 @@ +{"lineage":"wal-lineage-bbb","serial":2} +{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/script b/acceptance/bundle/deploy/wal/lineage-mismatch/script index b241246e6c9..4617c338fe7 100644 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/script +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/script @@ -1,25 +1,6 @@ -echo "=== Creating state file with lineage-A ===" mkdir -p .databricks/bundle/default -cat > .databricks/bundle/default/resources.json << 'EOF' -{ - "state_version": 1, - "cli_version": "0.0.0", - "lineage": "state-lineage-aaa", - "serial": 1, - "state": { - "resources.jobs.test_job": { - "__id__": "1001", - "state": {"name": "test-job"} - } - } -} -EOF - -echo "=== Creating WAL with lineage-B (mismatch) ===" -cat > .databricks/bundle/default/resources.json.wal << 'EOF' -{"lineage":"wal-lineage-bbb","serial":2} -{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} -EOF +cp resources.json .databricks/bundle/default/ +cp resources.json.wal .databricks/bundle/default/ echo "=== WAL content ===" cat .databricks/bundle/default/resources.json.wal diff --git a/acceptance/bundle/deploy/wal/stale-wal/output.txt b/acceptance/bundle/deploy/wal/stale-wal/output.txt index 682534de7ce..a2066ccdd8f 100644 --- a/acceptance/bundle/deploy/wal/stale-wal/output.txt +++ b/acceptance/bundle/deploy/wal/stale-wal/output.txt @@ -1,6 +1,3 @@ 
-=== Creating state directory === -=== Creating state file (serial=2) === -=== Creating stale WAL with old serial (serial=1) === === WAL content before deploy === {"lineage":"stale-test-lineage","serial": [SERIAL]} {"k":"resources.jobs.stale_job","v":{"__id__": "[ID]","state":{"name":"stale-job"}}} diff --git a/acceptance/bundle/deploy/wal/stale-wal/resources.json b/acceptance/bundle/deploy/wal/stale-wal/resources.json new file mode 100644 index 00000000000..6fd38b67ae8 --- /dev/null +++ b/acceptance/bundle/deploy/wal/stale-wal/resources.json @@ -0,0 +1,12 @@ +{ + "state_version": 1, + "cli_version": "0.0.0", + "lineage": "stale-test-lineage", + "serial": 2, + "state": { + "resources.jobs.test_job": { + "__id__": "1001", + "state": {"name": "test-job"} + } + } +} diff --git a/acceptance/bundle/deploy/wal/stale-wal/resources.json.wal b/acceptance/bundle/deploy/wal/stale-wal/resources.json.wal new file mode 100644 index 00000000000..ef5f380ed84 --- /dev/null +++ b/acceptance/bundle/deploy/wal/stale-wal/resources.json.wal @@ -0,0 +1,2 @@ +{"lineage":"stale-test-lineage","serial":1} +{"k":"resources.jobs.stale_job","v":{"__id__":"9999","state":{"name":"stale-job"}}} diff --git a/acceptance/bundle/deploy/wal/stale-wal/script b/acceptance/bundle/deploy/wal/stale-wal/script index d814639a00e..4de1bc1e921 100644 --- a/acceptance/bundle/deploy/wal/stale-wal/script +++ b/acceptance/bundle/deploy/wal/stale-wal/script @@ -1,27 +1,6 @@ -echo "=== Creating state directory ===" mkdir -p .databricks/bundle/default - -echo "=== Creating state file (serial=2) ===" -cat > .databricks/bundle/default/resources.json << 'EOF' -{ - "state_version": 1, - "cli_version": "0.0.0", - "lineage": "stale-test-lineage", - "serial": 2, - "state": { - "resources.jobs.test_job": { - "__id__": "1001", - "state": {"name": "test-job"} - } - } -} -EOF - -echo "=== Creating stale WAL with old serial (serial=1) ===" -cat > .databricks/bundle/default/resources.json.wal << 'EOF' 
-{"lineage":"stale-test-lineage","serial":1} -{"k":"resources.jobs.stale_job","v":{"__id__":"9999","state":{"name":"stale-job"}}} -EOF +cp resources.json .databricks/bundle/default/ +cp resources.json.wal .databricks/bundle/default/ echo "=== WAL content before deploy ===" cat .databricks/bundle/default/resources.json.wal diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/databricks.yml b/acceptance/bundle/deploy/wal/wal-with-delete/databricks.yml index 457a2d3e964..128bbe37f56 100644 --- a/acceptance/bundle/deploy/wal/wal-with-delete/databricks.yml +++ b/acceptance/bundle/deploy/wal/wal-with-delete/databricks.yml @@ -1,15 +1,4 @@ bundle: name: wal-delete-test -resources: - jobs: - test_job: - name: "test-job" - tasks: - - task_key: "test-task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 +resources: {} diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/output.txt b/acceptance/bundle/deploy/wal/wal-with-delete/output.txt index 8f52732d3e9..a7960906d3f 100644 --- a/acceptance/bundle/deploy/wal/wal-with-delete/output.txt +++ b/acceptance/bundle/deploy/wal/wal-with-delete/output.txt @@ -1,10 +1,6 @@ -=== Creating state directory === -=== Creating state file (job exists) === -=== Creating WAL with delete entry (simulating crash during delete) === === WAL content === {"lineage":"delete-test-lineage","serial": [SERIAL]} {"k":"resources.jobs.test_job","v":null} -=== Updating config to remove job === === Deploy (should recover delete from WAL) === >>> [CLI] bundle deploy diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/resources.json b/acceptance/bundle/deploy/wal/wal-with-delete/resources.json new file mode 100644 index 00000000000..04263ec36f9 --- /dev/null +++ b/acceptance/bundle/deploy/wal/wal-with-delete/resources.json @@ -0,0 +1,12 @@ +{ + "state_version": 1, + "cli_version": "0.0.0", + "lineage": "delete-test-lineage", + "serial": 1, + "state": { + 
"resources.jobs.test_job": { + "__id__": "1001", + "state": {"name": "test-job"} + } + } +} diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/resources.json.wal b/acceptance/bundle/deploy/wal/wal-with-delete/resources.json.wal new file mode 100644 index 00000000000..9b5c6169e3f --- /dev/null +++ b/acceptance/bundle/deploy/wal/wal-with-delete/resources.json.wal @@ -0,0 +1,2 @@ +{"lineage":"delete-test-lineage","serial":2} +{"k":"resources.jobs.test_job","v":null} diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/script b/acceptance/bundle/deploy/wal/wal-with-delete/script index f840355267c..5d5a78a885b 100644 --- a/acceptance/bundle/deploy/wal/wal-with-delete/script +++ b/acceptance/bundle/deploy/wal/wal-with-delete/script @@ -1,39 +1,10 @@ -echo "=== Creating state directory ===" mkdir -p .databricks/bundle/default - -echo "=== Creating state file (job exists) ===" -cat > .databricks/bundle/default/resources.json << 'EOF' -{ - "state_version": 1, - "cli_version": "0.0.0", - "lineage": "delete-test-lineage", - "serial": 1, - "state": { - "resources.jobs.test_job": { - "__id__": "1001", - "state": {"name": "test-job"} - } - } -} -EOF - -echo "=== Creating WAL with delete entry (simulating crash during delete) ===" -cat > .databricks/bundle/default/resources.json.wal << 'EOF' -{"lineage":"delete-test-lineage","serial":2} -{"k":"resources.jobs.test_job","v":null} -EOF +cp resources.json .databricks/bundle/default/ +cp resources.json.wal .databricks/bundle/default/ echo "=== WAL content ===" cat .databricks/bundle/default/resources.json.wal -echo "=== Updating config to remove job ===" -cat > databricks.yml << 'EOF' -bundle: - name: wal-delete-test - -resources: {} -EOF - echo "=== Deploy (should recover delete from WAL) ===" trace $CLI bundle deploy From 97bd52c50c7bc82dde5ffdf6c57ad514e5f39a87 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 11:28:58 +0200 Subject: [PATCH 47/80] update test output after rebase --- 
acceptance/selftest/kill_caller/offset/output.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/acceptance/selftest/kill_caller/offset/output.txt b/acceptance/selftest/kill_caller/offset/output.txt index 03407dd0d8a..cb87595a2c2 100644 --- a/acceptance/selftest/kill_caller/offset/output.txt +++ b/acceptance/selftest/kill_caller/offset/output.txt @@ -1,15 +1,15 @@ >>> [CLI] current-user me { - "id":"123", - "userName":"test@example.com" + "id": "123", + "userName": "test@example.com" } Attempt 1 done - success (offset) >>> [CLI] current-user me { - "id":"123", - "userName":"test@example.com" + "id": "123", + "userName": "test@example.com" } Attempt 2 done - success (offset) @@ -27,7 +27,7 @@ Attempt 4 done - killed >>> [CLI] current-user me { - "id":"123", - "userName":"test@example.com" + "id": "123", + "userName": "test@example.com" } Attempt 5 done - success (past kill window) From db53353b90b5512cd1a6f82d35826def7455315f Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 11:36:16 +0200 Subject: [PATCH 48/80] destroyCore: warn on Finalize failure instead of aborting Resources are already deleted at this point; failing hard prevents the file-cleanup step from running. Downgrade to a warning so destroyCore continues to delete the remote files regardless. 
Co-authored-by: Denis Bilenko --- bundle/phases/destroy.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index 68657f4e519..a130c7a1c69 100644 --- a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -14,6 +14,7 @@ import ( "github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/direct" "github.com/databricks/cli/libs/cmdio" + "github.com/databricks/cli/libs/diag" "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/logdiag" "github.com/databricks/databricks-sdk-go/apierr" @@ -82,9 +83,14 @@ func destroyCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, e } // Flush WAL to local state file before deleting remote files. + // Warn instead of hard-error: resources are already deleted, so proceed + // with file cleanup regardless of whether state flush succeeds. if engine.IsDirect() { if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { - logdiag.LogError(ctx, err) + diags := diag.WarningFromErr(err) + if len(diags) > 0 { + logdiag.LogDiag(ctx, diags[0]) + } } } From 4ef7c16226438628cabf95d1a8cf872302ee9f73 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 11:37:07 +0200 Subject: [PATCH 49/80] update test output --- .../bundle/resources/apps/create_already_exists/output.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/acceptance/bundle/resources/apps/create_already_exists/output.txt b/acceptance/bundle/resources/apps/create_already_exists/output.txt index e4438d47b04..82deb4ab43a 100644 --- a/acceptance/bundle/resources/apps/create_already_exists/output.txt +++ b/acceptance/bundle/resources/apps/create_already_exists/output.txt @@ -37,7 +37,6 @@ HTTP Status: 409 Conflict API error_code: RESOURCE_ALREADY_EXISTS API message: An app with the same name already exists: test-app-already-exists -Updating deployment state... 
>>> [CLI] apps delete test-app-already-exists { From 47e11ad9cdb0d3089d15b86f06d8735854faf3a1 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 12:44:19 +0200 Subject: [PATCH 50/80] deployCore: use Finalize return value instead of re-opening state Finalize now returns (ExportedResourcesMap, error), capturing the merged state before clearing. This lets deploy.go use LoadFromState directly instead of closing and re-opening the state file from disk. LoadFromState is a new statemgmt constructor that accepts pre-computed state and skips the engine dispatch in Load.Apply. Co-authored-by: Denis Bilenko --- bundle/direct/bind.go | 19 +++++--- bundle/direct/dstate/state.go | 46 ++++++++++++------- bundle/direct/dstate/state_test.go | 20 +++++--- bundle/phases/deploy.go | 22 +++++---- bundle/phases/destroy.go | 2 +- bundle/statemgmt/state_load.go | 37 +++++++++++---- .../statemgmt/upload_state_for_yaml_sync.go | 2 +- cmd/bundle/deployment/migrate.go | 2 +- 8 files changed, 100 insertions(+), 50 deletions(-) diff --git a/bundle/direct/bind.go b/bundle/direct/bind.go index 7e32bfd6479..c16f763afcc 100644 --- a/bundle/direct/bind.go +++ b/bundle/direct/bind.go @@ -64,7 +64,9 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac var checkStateDB dstate.DeploymentState if err := checkStateDB.Open(ctx, statePath, dstate.WithRecovery(true), dstate.WithWrite(false)); err == nil { existingID := checkStateDB.GetResourceID(resourceKey) - _ = checkStateDB.Finalize(ctx) + if _, err := checkStateDB.Finalize(ctx); err != nil { + log.Warnf(ctx, "failed to finalize state: %v", err) + } if existingID != "" { return nil, ErrResourceAlreadyBound{ ResourceKey: resourceKey, @@ -98,7 +100,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac } // Finalize to persist temp state to disk - err = b.StateDB.Finalize(ctx) + _, err = b.StateDB.Finalize(ctx) if err != nil { os.Remove(tmpStatePath) return nil, err @@ -117,7 
+119,9 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac os.Remove(tmpStatePath) return nil, err } - _ = b.StateDB.Finalize(ctx) + if _, err := b.StateDB.Finalize(ctx); err != nil { + log.Warnf(ctx, "failed to finalize state: %v", err) + } // Populate the state with the resolved config entry := plan.Plan[resourceKey] @@ -152,7 +156,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac return nil, err } - err = b.StateDB.Finalize(ctx) + _, err = b.StateDB.Finalize(ctx) if err != nil { os.Remove(tmpStatePath) return nil, err @@ -166,7 +170,9 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac return nil, err } plan, err = b.CalculatePlan(ctx, client, configRoot) - _ = b.StateDB.Finalize(ctx) + if _, ferr := b.StateDB.Finalize(ctx); ferr != nil { + log.Warnf(ctx, "failed to finalize state: %v", ferr) + } if err != nil { os.Remove(tmpStatePath) return nil, err @@ -236,5 +242,6 @@ func (b *DeploymentBundle) Unbind(ctx context.Context, statePath, resourceKey st } } - return b.StateDB.Finalize(ctx) + _, err = b.StateDB.Finalize(ctx) + return err } diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 6a7e73778be..7dc9b97f981 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -32,19 +32,25 @@ const ( var errStaleWAL = errors.New("stale WAL") type DeploymentState struct { - Path string - Data Database - mu sync.Mutex - walFile *os.File + Path string + Data Database + mu sync.Mutex + walFile *os.File + + // Maps resource key to ID. Unlike Data.State, this is up to date during writes (deploys).
stateIDs map[string]string } type Database struct { - StateVersion int `json:"state_version"` - CLIVersion string `json:"cli_version"` - Lineage string `json:"lineage"` - Serial int `json:"serial"` - State map[string]ResourceEntry `json:"state"` + StateVersion int `json:"state_version"` + CLIVersion string `json:"cli_version"` + Lineage string `json:"lineage"` + Serial int `json:"serial"` + + // Maps resource key to ResourceEntry which includes ID + full serialized state. + // This is not updated during write/deploy, those writes go to WAL instead. + // The State is then reconstructed from WAL. + State map[string]ResourceEntry `json:"state"` } type ResourceEntry struct { @@ -346,14 +352,15 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) return lineNumber > 1, nil } -// Finalize replays the WAL (if open for write) and resets the state. +// Finalize replays the WAL (if open for write), captures the resulting state, and resets. // Safe to call multiple times or on an already-finalized state. -func (db *DeploymentState) Finalize(ctx context.Context) error { +// Returns the exported state as of the end of this operation. +func (db *DeploymentState) Finalize(ctx context.Context) (resourcestate.ExportedResourcesMap, error) { db.mu.Lock() defer db.mu.Unlock() if db.Path == "" { - return nil + return nil, nil } var err error @@ -364,11 +371,13 @@ func (db *DeploymentState) Finalize(ctx context.Context) error { err = db.replayWAL(ctx) } + state := ExportStateFromData(db.Data) + db.Path = "" db.Data = Database{} - db.stateIDs = make(map[string]string) + db.stateIDs = nil - return err + return state, err } // UpgradeToWrite transitions from read mode to write mode without re-reading state. @@ -427,9 +436,10 @@ func (db *DeploymentState) AssertOpenedForWrite() { } } -func (db *DeploymentState) ExportState(ctx context.Context) resourcestate.ExportedResourcesMap { +// ExportStateFromData extracts resource IDs and ETags from a database snapshot. 
+func ExportStateFromData(data Database) resourcestate.ExportedResourcesMap { result := make(resourcestate.ExportedResourcesMap) - for key, entry := range db.Data.State { + for key, entry := range data.State { var etag string // Extract etag for dashboards. // covered by test case: bundle/deploy/dashboard/detect-change @@ -450,6 +460,10 @@ func (db *DeploymentState) ExportState(ctx context.Context) resourcestate.Export return result } +func (db *DeploymentState) ExportState(ctx context.Context) resourcestate.ExportedResourcesMap { + return ExportStateFromData(db.Data) +} + func (db *DeploymentState) unlockedSave() error { data, err := json.MarshalIndent(db.Data, "", " ") if err != nil { diff --git a/bundle/direct/dstate/state_test.go b/bundle/direct/dstate/state_test.go index 3f0f614cd3f..afe8634790a 100644 --- a/bundle/direct/dstate/state_test.go +++ b/bundle/direct/dstate/state_test.go @@ -9,6 +9,12 @@ import ( "github.com/stretchr/testify/require" ) +func mustFinalize(t *testing.T, db *DeploymentState) { + t.Helper() + _, err := db.Finalize(t.Context()) + require.NoError(t, err) +} + func TestOpenSaveFinalizeRoundTrip(t *testing.T) { path := filepath.Join(t.TempDir(), "state.json") @@ -16,14 +22,14 @@ func TestOpenSaveFinalizeRoundTrip(t *testing.T) { require.NoError(t, db.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) require.NoError(t, db.SaveState("jobs.my_job", "123", map[string]string{"key": "val"}, nil)) - require.NoError(t, db.Finalize(t.Context())) + mustFinalize(t, &db) // Re-open and verify persisted data. 
var db2 DeploymentState require.NoError(t, db2.Open(t.Context(), path, WithRecovery(false), WithWrite(false))) assert.Equal(t, 1, db2.Data.Serial) assert.Equal(t, "123", db2.GetResourceID("jobs.my_job")) - require.NoError(t, db2.Finalize(t.Context())) + mustFinalize(t, &db2) } func TestFinalizeWithNoEntriesDoesNotWriteStateFile(t *testing.T) { @@ -31,7 +37,7 @@ func TestFinalizeWithNoEntriesDoesNotWriteStateFile(t *testing.T) { var db DeploymentState require.NoError(t, db.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) - require.NoError(t, db.Finalize(t.Context())) + mustFinalize(t, &db) _, err := os.Stat(path) assert.ErrorIs(t, err, os.ErrNotExist) @@ -46,7 +52,7 @@ func TestPanicOnDoubleOpen(t *testing.T) { assert.Panics(t, func() { _ = db.Open(t.Context(), path, WithRecovery(true), WithWrite(true)) }) - require.NoError(t, db.Finalize(t.Context())) + mustFinalize(t, &db) } func TestDeleteState(t *testing.T) { @@ -55,16 +61,16 @@ func TestDeleteState(t *testing.T) { var db DeploymentState require.NoError(t, db.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) require.NoError(t, db.SaveState("jobs.my_job", "123", map[string]string{}, nil)) - require.NoError(t, db.Finalize(t.Context())) + mustFinalize(t, &db) var db2 DeploymentState require.NoError(t, db2.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) require.NoError(t, db2.DeleteState("jobs.my_job")) - require.NoError(t, db2.Finalize(t.Context())) + mustFinalize(t, &db2) var db3 DeploymentState require.NoError(t, db3.Open(t.Context(), path, WithRecovery(false), WithWrite(false))) assert.Equal(t, 2, db3.Data.Serial) assert.Equal(t, "", db3.GetResourceID("jobs.my_job")) - require.NoError(t, db3.Finalize(t.Context())) + mustFinalize(t, &db3) } diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index 7efe71b8507..e318fa1ffe6 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -15,7 +15,6 @@ import ( 
"github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/direct" - "github.com/databricks/cli/bundle/direct/dstate" "github.com/databricks/cli/bundle/libraries" "github.com/databricks/cli/bundle/metrics" "github.com/databricks/cli/bundle/permissions" @@ -75,14 +74,12 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta bundle.ApplyContext(ctx, b, terraform.Apply()) } - // Close state to replay WAL into state file, then reopen for read. - // PushResourcesState needs the file on disk, Load needs the state in memory. + // Flush WAL to state file on disk; capture the resulting state for Load below. + var directState statemgmt.ExportedResourcesMap if targetEngine.IsDirect() { - if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { - logdiag.LogError(ctx, err) - } - _, localPath := b.StateFilenameDirect(ctx) - if err := b.DeploymentBundle.StateDB.Open(ctx, localPath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { + var err error + directState, err = b.DeploymentBundle.StateDB.Finalize(ctx) + if err != nil { logdiag.LogError(ctx, err) } } @@ -93,8 +90,15 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta return } + var loadMutator bundle.Mutator + if targetEngine.IsDirect() { + loadMutator = statemgmt.LoadFromState(directState) + } else { + loadMutator = statemgmt.Load(targetEngine) + } + bundle.ApplySeqContext(ctx, b, - statemgmt.Load(targetEngine), + loadMutator, metadata.Compute(), metadata.Upload(), statemgmt.UploadStateForYamlSync(targetEngine), diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index a130c7a1c69..98e6f7fee2a 100644 --- a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -86,7 +86,7 @@ func destroyCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, e // Warn instead of hard-error: resources are already deleted, so proceed // with file cleanup 
regardless of whether state flush succeeds. if engine.IsDirect() { - if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { + if _, err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { diags := diag.WarningFromErr(err) if len(diags) > 0 { logdiag.LogDiag(ctx, diags[0]) diff --git a/bundle/statemgmt/state_load.go b/bundle/statemgmt/state_load.go index 3345792c295..4894fc08a67 100644 --- a/bundle/statemgmt/state_load.go +++ b/bundle/statemgmt/state_load.go @@ -35,7 +35,6 @@ func (l *load) Name() string { } func (l *load) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { - var err error var state ExportedResourcesMap if l.engine.IsDirect() { @@ -48,14 +47,29 @@ func (l *load) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { } } - err = l.validateState(state) - if err != nil { + return applyState(ctx, b, state, l.modes) +} + +type loadFromState struct { + state ExportedResourcesMap + modes []LoadMode +} + +func (l *loadFromState) Name() string { + return "statemgmt.Load" +} + +func (l *loadFromState) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { + return applyState(ctx, b, l.state, l.modes) +} + +// applyState merges the exported resource state into the bundle configuration. +func applyState(ctx context.Context, b *bundle.Bundle, state ExportedResourcesMap, modes []LoadMode) diag.Diagnostics { + if err := validateLoadedState(state, modes); err != nil { return diag.FromErr(err) } - // Merge state into configuration. 
- err = StateToBundle(ctx, state, &b.Config) - if err != nil { + if err := StateToBundle(ctx, state, &b.Config); err != nil { return diag.FromErr(err) } @@ -160,14 +174,19 @@ func StateToBundle(ctx context.Context, state ExportedResourcesMap, config *conf }) } -func (l *load) validateState(state ExportedResourcesMap) error { - if len(state) == 0 && slices.Contains(l.modes, ErrorOnEmptyState) { +func validateLoadedState(state ExportedResourcesMap, modes []LoadMode) error { + if len(state) == 0 && slices.Contains(modes, ErrorOnEmptyState) { return errors.New("resource not found or not yet deployed. Did you forget to run 'databricks bundle deploy'?") } - return nil } func Load(engine engine.EngineType, modes ...LoadMode) bundle.Mutator { return &load{modes: modes, engine: engine} } + +// LoadFromState returns a mutator that loads the provided pre-computed state into the bundle, +// skipping the engine-specific state retrieval step. +func LoadFromState(state ExportedResourcesMap, modes ...LoadMode) bundle.Mutator { + return &loadFromState{state: state, modes: modes} +} diff --git a/bundle/statemgmt/upload_state_for_yaml_sync.go b/bundle/statemgmt/upload_state_for_yaml_sync.go index 5b1fbc3bf67..0399c7b31ff 100644 --- a/bundle/statemgmt/upload_state_for_yaml_sync.go +++ b/bundle/statemgmt/upload_state_for_yaml_sync.go @@ -198,7 +198,7 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun } deploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(true)) - if err := deploymentBundle.StateDB.Finalize(ctx); err != nil { + if _, err := deploymentBundle.StateDB.Finalize(ctx); err != nil { return false, err } diff --git a/cmd/bundle/deployment/migrate.go b/cmd/bundle/deployment/migrate.go index f4512f4e1f0..77d95e3533e 100644 --- a/cmd/bundle/deployment/migrate.go +++ b/cmd/bundle/deployment/migrate.go @@ -282,7 +282,7 @@ To start using direct engine, set "engine: direct" under bundle in your databric } 
deploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(true)) - if err := deploymentBundle.StateDB.Finalize(ctx); err != nil { + if _, err := deploymentBundle.StateDB.Finalize(ctx); err != nil { logdiag.LogError(ctx, err) } if logdiag.HasError(ctx) { From deae01f8a215b9c323d4489f511edaa8ea8bed36 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 13:52:02 +0200 Subject: [PATCH 51/80] statemgmt.Load: accept state directly instead of engine Callers now extract state before calling Load (ExportState for direct, ParseResourcesState for terraform). This removes the engine dispatch from inside the mutator and makes the data flow explicit. Co-authored-by: Denis Bilenko --- bundle/phases/deploy.go | 18 ++++++++------ bundle/statemgmt/state_load.go | 42 ++++---------------------------- cmd/bundle/generate/dashboard.go | 15 +++++++++++- cmd/bundle/utils/process.go | 14 ++++++++++- 4 files changed, 42 insertions(+), 47 deletions(-) diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index e318fa1ffe6..00452346df0 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -75,10 +75,10 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta } // Flush WAL to state file on disk; capture the resulting state for Load below. 
- var directState statemgmt.ExportedResourcesMap + var state statemgmt.ExportedResourcesMap if targetEngine.IsDirect() { var err error - directState, err = b.DeploymentBundle.StateDB.Finalize(ctx) + state, err = b.DeploymentBundle.StateDB.Finalize(ctx) if err != nil { logdiag.LogError(ctx, err) } @@ -90,15 +90,17 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta return } - var loadMutator bundle.Mutator - if targetEngine.IsDirect() { - loadMutator = statemgmt.LoadFromState(directState) - } else { - loadMutator = statemgmt.Load(targetEngine) + if !targetEngine.IsDirect() { + var err error + state, err = terraform.ParseResourcesState(ctx, b) + if err != nil { + logdiag.LogError(ctx, err) + return + } } bundle.ApplySeqContext(ctx, b, - loadMutator, + statemgmt.Load(state), metadata.Compute(), metadata.Upload(), statemgmt.UploadStateForYamlSync(targetEngine), diff --git a/bundle/statemgmt/state_load.go b/bundle/statemgmt/state_load.go index 4894fc08a67..573c69126c2 100644 --- a/bundle/statemgmt/state_load.go +++ b/bundle/statemgmt/state_load.go @@ -9,9 +9,7 @@ import ( "github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/config" - "github.com/databricks/cli/bundle/config/engine" "github.com/databricks/cli/bundle/config/resources" - "github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/bundle/statemgmt/resourcestate" "github.com/databricks/cli/libs/diag" "github.com/databricks/cli/libs/dyn" @@ -26,40 +24,15 @@ type ( const ErrorOnEmptyState LoadMode = 0 type load struct { - modes []LoadMode - engine engine.EngineType -} - -func (l *load) Name() string { - return "statemgmt.Load" -} - -func (l *load) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { - var state ExportedResourcesMap - - if l.engine.IsDirect() { - state = b.DeploymentBundle.ExportState(ctx) - } else { - var err error - state, err = terraform.ParseResourcesState(ctx, b) - if err != nil { - return diag.FromErr(err) - } - 
} - - return applyState(ctx, b, state, l.modes) -} - -type loadFromState struct { state ExportedResourcesMap modes []LoadMode } -func (l *loadFromState) Name() string { +func (l *load) Name() string { return "statemgmt.Load" } -func (l *loadFromState) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { +func (l *load) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { return applyState(ctx, b, l.state, l.modes) } @@ -181,12 +154,7 @@ func validateLoadedState(state ExportedResourcesMap, modes []LoadMode) error { return nil } -func Load(engine engine.EngineType, modes ...LoadMode) bundle.Mutator { - return &load{modes: modes, engine: engine} -} - -// LoadFromState returns a mutator that loads the provided pre-computed state into the bundle, -// skipping the engine-specific state retrieval step. -func LoadFromState(state ExportedResourcesMap, modes ...LoadMode) bundle.Mutator { - return &loadFromState{state: state, modes: modes} +// Load returns a mutator that merges the provided resource state into the bundle configuration. 
+func Load(state ExportedResourcesMap, modes ...LoadMode) bundle.Mutator { + return &load{state: state, modes: modes} } diff --git a/cmd/bundle/generate/dashboard.go b/cmd/bundle/generate/dashboard.go index ca02cc414ea..fefcd0f6e65 100644 --- a/cmd/bundle/generate/dashboard.go +++ b/cmd/bundle/generate/dashboard.go @@ -19,6 +19,7 @@ import ( "github.com/databricks/cli/bundle/direct/dstate" "github.com/databricks/cli/bundle/generate" "github.com/databricks/cli/bundle/phases" + "github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/bundle/resources" "github.com/databricks/cli/bundle/statemgmt" "github.com/databricks/cli/cmd/bundle/deployment" @@ -398,8 +399,20 @@ func (d *dashboard) runForResource(ctx context.Context, b *bundle.Bundle) { } } + var state statemgmt.ExportedResourcesMap + if stateDesc.Engine.IsDirect() { + state = b.DeploymentBundle.ExportState(ctx) + } else { + var err error + state, err = terraform.ParseResourcesState(ctx, b) + if err != nil { + logdiag.LogError(ctx, err) + return + } + } + bundle.ApplySeqContext(ctx, b, - statemgmt.Load(stateDesc.Engine), + statemgmt.Load(state), ) if logdiag.HasError(ctx) { return diff --git a/cmd/bundle/utils/process.go b/cmd/bundle/utils/process.go index c142f4d943e..5f43cff6acd 100644 --- a/cmd/bundle/utils/process.go +++ b/cmd/bundle/utils/process.go @@ -11,6 +11,7 @@ import ( "github.com/databricks/cli/bundle/config/engine" "github.com/databricks/cli/bundle/config/mutator" "github.com/databricks/cli/bundle/config/validate" + "github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/direct" "github.com/databricks/cli/bundle/direct/dstate" @@ -200,8 +201,19 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle if opts.ErrorOnEmptyState { modes = append(modes, statemgmt.ErrorOnEmptyState) } + var state statemgmt.ExportedResourcesMap + if stateDesc.Engine.IsDirect() { + state = 
b.DeploymentBundle.ExportState(ctx) + } else { + var err error + state, err = terraform.ParseResourcesState(ctx, b) + if err != nil { + logdiag.LogError(ctx, err) + return b, stateDesc, root.ErrAlreadyPrinted + } + } mutators := []bundle.Mutator{ - statemgmt.Load(stateDesc.Engine, modes...), + statemgmt.Load(state, modes...), } // InitializeURLs makes an extra API call; only run it when URLs are needed. if opts.InitIDs { From f896d361dbef068b4ed2f8a89fc2f48130b459ce Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 14:00:47 +0200 Subject: [PATCH 52/80] fmt --- cmd/bundle/generate/dashboard.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/bundle/generate/dashboard.go b/cmd/bundle/generate/dashboard.go index fefcd0f6e65..71f4f573cf5 100644 --- a/cmd/bundle/generate/dashboard.go +++ b/cmd/bundle/generate/dashboard.go @@ -16,10 +16,10 @@ import ( "time" "github.com/databricks/cli/bundle" + "github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/bundle/direct/dstate" "github.com/databricks/cli/bundle/generate" "github.com/databricks/cli/bundle/phases" - "github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/bundle/resources" "github.com/databricks/cli/bundle/statemgmt" "github.com/databricks/cli/cmd/bundle/deployment" From 08b304e5651975b518f524c02a2b6f0698c1d75a Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 14:03:35 +0200 Subject: [PATCH 53/80] deployCore: move ParseResourcesState before PushResourcesState Both engines now capture post-apply state in the same location, before pushing. The two operations are independent reads of the terraform state file, so order relative to PushResourcesState does not matter. 
Co-authored-by: Denis Bilenko --- bundle/phases/deploy.go | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index 00452346df0..6c03ac8870d 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -74,7 +74,9 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta bundle.ApplyContext(ctx, b, terraform.Apply()) } - // Flush WAL to state file on disk; capture the resulting state for Load below. + // Capture post-apply state for Load below. + // For direct: flush WAL to disk (Finalize) and capture the result. + // For terraform: parse the state file written by terraform.Apply. var state statemgmt.ExportedResourcesMap if targetEngine.IsDirect() { var err error @@ -82,6 +84,12 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta if err != nil { logdiag.LogError(ctx, err) } + } else { + var err error + state, err = terraform.ParseResourcesState(ctx, b) + if err != nil { + logdiag.LogError(ctx, err) + } } // Even if deployment failed, there might be updates in states that we need to upload @@ -90,15 +98,6 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta return } - if !targetEngine.IsDirect() { - var err error - state, err = terraform.ParseResourcesState(ctx, b) - if err != nil { - logdiag.LogError(ctx, err) - return - } - } - bundle.ApplySeqContext(ctx, b, statemgmt.Load(state), metadata.Compute(), From 32b498845b2998c753683b04f2c4794ed524c60c Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 14:26:00 +0200 Subject: [PATCH 54/80] simplify test --- .../bundle/deploy/wal/corrupted-wal-entry/script | 6 +----- .../bundle/deploy/wal/corrupted-wal-entry/test.toml | 13 ------------- 2 files changed, 1 insertion(+), 18 deletions(-) delete mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml diff --git 
a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script index 191a62f01fc..043a13d9971 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script @@ -12,11 +12,7 @@ echo "=== Final state (should have recovered entries) ===" cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys | sort)}' echo "=== Corrupted WAL entries file ===" -if [ -f ".databricks/bundle/default/resources.json.wal.corrupted" ]; then - cat .databricks/bundle/default/resources.json.wal.corrupted -else - echo "Missing corrupted WAL entries file (unexpected)" -fi +cat .databricks/bundle/default/resources.json.wal.corrupted echo "=== WAL after successful deploy ===" if [ -f ".databricks/bundle/default/resources.json.wal" ]; then diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml deleted file mode 100644 index 6245c198409..00000000000 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml +++ /dev/null @@ -1,13 +0,0 @@ -# WAL with corrupted LAST entry - valid entries should be recovered, corrupted last line skipped. 
- -[[Server]] -Pattern = "POST /api/2.2/jobs/reset" -Response.Body = '{}' - -[[Server]] -Pattern = "GET /api/2.2/jobs/get?job_id=1111" -Response.Body = '{"job_id": 1111, "settings": {"name": "valid-job"}}' - -[[Server]] -Pattern = "GET /api/2.2/jobs/get?job_id=2222" -Response.Body = '{"job_id": 2222, "settings": {"name": "another-valid"}}' From f628251fc0c3b6eb639e05d89486031a19340cef Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 14:47:46 +0200 Subject: [PATCH 55/80] update outputs --- .../bundle/deploy/wal/chain-3-jobs/output.txt | 4 +-- .../deploy/wal/corrupted-wal-entry/output.txt | 33 ++++++++++++------- .../corrupted-wal-entry/resources.json.wal | 4 --- .../deploy/wal/corrupted-wal-entry/script | 21 +++++++++--- .../deploy/wal/crash-after-create/output.txt | 2 +- .../deploy/wal/future-serial-wal/output.txt | 2 +- .../deploy/wal/lineage-mismatch/output.txt | 2 +- .../bundle/deploy/wal/stale-wal/output.txt | 2 +- acceptance/bundle/deploy/wal/test.toml | 8 ----- 9 files changed, 44 insertions(+), 34 deletions(-) delete mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt index 1f4b53f7cf8..8ca8e388d39 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt @@ -17,7 +17,7 @@ Exit code: [KILLED] { "k": "resources.jobs.job_01", "v": { - "__id__": "[ID]", + "__id__": "1001", "state": { "deployment": { "kind": "BUNDLE", @@ -50,7 +50,7 @@ Exit code: [KILLED] { "k": "resources.jobs.job_02", "v": { - "__id__": "[ID]", + "__id__": "1001", "depends_on": [ { "label": "${resources.jobs.job_01.id}", diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt index bd886c153fe..f92d8a67acf 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt +++ 
b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt @@ -1,9 +1,8 @@ === WAL content === {"lineage":"test-lineage-123","serial": [SERIAL]} -{"k":"resources.jobs.valid_job","v":{"__id__": "[ID]","state":{"name":"valid-job"}}} -{"k":"resources.jobs.another_valid","v":{"__id__": "[ID]","state":{"name":"another-valid"}}} -{"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial- -=== Deploy (should recover valid entries, skip corrupted last line) === +{"k":"resources.jobs.valid_job","v":{"__id__":"[JOB1_ID]","state":{"name":"valid-job"}}} +{"k":"resources.jobs.another_valid","v":{"__id__":"[JOB2_ID]","state":{"name":"another-valid"}}} +{"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial-=== Deploy (should recover valid entries, skip corrupted last line) === >>> [CLI] bundle deploy Warn: Skipping corrupted WAL entry at [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal:4: unexpected end of JSON input @@ -13,13 +12,23 @@ Deploying resources... Updating deployment state... Deployment complete! 
=== Final state (should have recovered entries) === -{ - "serial": [SERIAL], - "state_keys": [ - "resources.jobs.another_valid", - "resources.jobs.valid_job" - ] -} + +>>> [CLI] bundle summary +Name: wal-corrupted-test +Target: default +Workspace: + User: [USERNAME] + Path: /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-test/default +Resources: + Jobs: + another_valid: + Name: another-valid + URL: [DATABRICKS_URL]/jobs/[JOB2_ID]?o=[NUMID] + valid_job: + Name: valid-job + URL: [DATABRICKS_URL]/jobs/[JOB1_ID]?o=[NUMID] === Corrupted WAL entries file === -{"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial-=== WAL after successful deploy === + +>>> cat .databricks/bundle/default/resources.json.wal.corrupted +{"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial-=== WAL after successful deploy === WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal b/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal deleted file mode 100644 index 4791ba12814..00000000000 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal +++ /dev/null @@ -1,4 +0,0 @@ -{"lineage":"test-lineage-123","serial":6} -{"k":"resources.jobs.valid_job","v":{"__id__":"1111","state":{"name":"valid-job"}}} -{"k":"resources.jobs.another_valid","v":{"__id__":"2222","state":{"name":"another-valid"}}} -{"k":"resources.jobs.partial_write","v":{"__id__":"3333","state":{"name":"partial- diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script index 043a13d9971..16cee304deb 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script @@ -1,18 +1,31 @@ +# Create pre-existing jobs in the testserver so WAL recovery triggers DoUpdate (reset) instead of DoCreate +JOB1=$($CLI jobs create --json '{"name":"valid-job"}' | jq -r '.job_id') 
+JOB2=$($CLI jobs create --json '{"name":"another-valid"}' | jq -r '.job_id') +echo "$JOB1:JOB1_ID" >> ACC_REPLS +echo "$JOB2:JOB2_ID" >> ACC_REPLS + mkdir -p .databricks/bundle/default cp resources.json .databricks/bundle/default/ -cp resources.json.wal .databricks/bundle/default/ + +# Generate WAL with actual job IDs; truncate the partial_write entry to simulate corruption +{ + printf '{"lineage":"test-lineage-123","serial":6}\n' + printf '{"k":"resources.jobs.valid_job","v":{"__id__":"%s","state":{"name":"valid-job"}}}\n' "$JOB1" + printf '{"k":"resources.jobs.another_valid","v":{"__id__":"%s","state":{"name":"another-valid"}}}\n' "$JOB2" + printf '{"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial-' +} > .databricks/bundle/default/resources.json.wal echo "=== WAL content ===" cat .databricks/bundle/default/resources.json.wal echo "=== Deploy (should recover valid entries, skip corrupted last line) ===" -trace $CLI bundle deploy 2>&1 +trace $CLI bundle deploy echo "=== Final state (should have recovered entries) ===" -cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys | sort)}' +trace $CLI bundle summary echo "=== Corrupted WAL entries file ===" -cat .databricks/bundle/default/resources.json.wal.corrupted +trace cat .databricks/bundle/default/resources.json.wal.corrupted echo "=== WAL after successful deploy ===" if [ -f ".databricks/bundle/default/resources.json.wal" ]; then diff --git a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt index b3250b2db1a..abc6d177f65 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/output.txt +++ b/acceptance/bundle/deploy/wal/crash-after-create/output.txt @@ -9,7 +9,7 @@ Exit code: [KILLED] === WAL should exist after crash === WAL exists (expected) {"lineage":"[UUID]","serial": [SERIAL],"state_version":2,"cli_version":"[DEV_VERSION]"} 
-{"k":"resources.jobs.job_a","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} +{"k":"resources.jobs.job_a","v":{"__id__":"1001","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} === State file after crash (should be empty) === cat: .databricks/bundle/default/resources.json: No such file or directory diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt index 8fc16565fe3..adb68c7c73b 100644 --- a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt +++ b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt @@ -1,6 +1,6 @@ === WAL content === {"lineage":"test-lineage-123","serial": [SERIAL]} -{"k":"resources.jobs.test_job","v":{"__id__": "[ID]","state":{"name":"test-job"}}} +{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} === Deploy (should fail with corruption error) === >>> errcode [CLI] bundle deploy diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt 
b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt index f090a161637..53c517b5838 100644 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt @@ -1,6 +1,6 @@ === WAL content === {"lineage":"wal-lineage-bbb","serial": [SERIAL]} -{"k":"resources.jobs.test_job","v":{"__id__": "[ID]","state":{"name":"test-job"}}} +{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} === Deploy (should fail with lineage mismatch error) === >>> errcode [CLI] bundle deploy diff --git a/acceptance/bundle/deploy/wal/stale-wal/output.txt b/acceptance/bundle/deploy/wal/stale-wal/output.txt index a2066ccdd8f..d51d94d9657 100644 --- a/acceptance/bundle/deploy/wal/stale-wal/output.txt +++ b/acceptance/bundle/deploy/wal/stale-wal/output.txt @@ -1,6 +1,6 @@ === WAL content before deploy === {"lineage":"stale-test-lineage","serial": [SERIAL]} -{"k":"resources.jobs.stale_job","v":{"__id__": "[ID]","state":{"name":"stale-job"}}} +{"k":"resources.jobs.stale_job","v":{"__id__":"9999","state":{"name":"stale-job"}}} === Deploy (should ignore stale WAL) === >>> [CLI] bundle deploy diff --git a/acceptance/bundle/deploy/wal/test.toml b/acceptance/bundle/deploy/wal/test.toml index 266d748049c..c4b21c01133 100644 --- a/acceptance/bundle/deploy/wal/test.toml +++ b/acceptance/bundle/deploy/wal/test.toml @@ -50,14 +50,6 @@ New = '"lineage": "[UUID]"' Old = '"serial":\s*\d+' New = '"serial": [SERIAL]' -[[Repls]] -Old = '"__id__":\s*"\d+"' -New = '"__id__": "[ID]"' - -[[Repls]] -Old = '"job_id":\s*"\d+"' -New = '"job_id": "[ID]"' - # Strip single-node cluster warnings (they appear in varying order and aren't relevant to WAL tests) [[Repls]] Old = '(?s)Warning: Single node cluster.*?ResourceClass: SingleNode\n \n\n' From 2c6d40cc8b1c00eb45dc9fb80bf2ced246b6a1c9 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 14:55:51 +0200 Subject: [PATCH 56/80] fix Windows replacement for process kill 
during deployment The old Windows rule matched 'Exit code: [KILLED]' which was never present because 'Exit code: 1' (Windows exit code for kill) was never normalized to '[KILLED]' -- the [KILLED] normalization only fires via exit code 137 (Linux) or after [PROCESS_KILLED] is already inserted. Match 'Exit code: 1' directly (the raw Windows exit code), then insert [PROCESS_KILLED] and normalize in one step. Co-authored-by: Isaac --- acceptance/bundle/deploy/wal/test.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/acceptance/bundle/deploy/wal/test.toml b/acceptance/bundle/deploy/wal/test.toml index c4b21c01133..23a203beb15 100644 --- a/acceptance/bundle/deploy/wal/test.toml +++ b/acceptance/bundle/deploy/wal/test.toml @@ -30,9 +30,9 @@ Old = '(\[PROCESS_KILLED\]\n\nExit code: )1' New = '${1}[KILLED]' # On Windows, no bash "Killed" message appears when CLI has produced output before termination. -# Insert [PROCESS_KILLED] between last output line and exit code for consistency. +# Match the raw exit code 1 (Windows never gets 137 or [PROCESS_KILLED] marker first). 
[[Repls]] -Old = '(Deploying resources\.\.\.)\n\nExit code: \[KILLED\]' +Old = '(Deploying resources\.\.\.)\n\nExit code: 1' New = """${1} [PROCESS_KILLED] From 38c175641fc1b0e0db0ba7568d16e8b2cc8f4f97 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 15:05:22 +0200 Subject: [PATCH 57/80] formatting --- acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt | 3 ++- acceptance/bundle/deploy/wal/corrupted-wal-entry/script | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt index f92d8a67acf..bf95cc13949 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt @@ -30,5 +30,6 @@ Resources: === Corrupted WAL entries file === >>> cat .databricks/bundle/default/resources.json.wal.corrupted -{"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial-=== WAL after successful deploy === +{"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial- +=== WAL after successful deploy === WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script index 16cee304deb..b6b12c347b5 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script @@ -27,7 +27,7 @@ trace $CLI bundle summary echo "=== Corrupted WAL entries file ===" trace cat .databricks/bundle/default/resources.json.wal.corrupted -echo "=== WAL after successful deploy ===" +printf "\n=== WAL after successful deploy ===\n" if [ -f ".databricks/bundle/default/resources.json.wal" ]; then echo "WAL exists (unexpected)" else From f27e4ca1f896f55c2cd65173862ae55e71c247ba Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 15:07:45 +0200 Subject: [PATCH 58/80] clean up --- 
acceptance/bundle/deploy/wal/test.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/acceptance/bundle/deploy/wal/test.toml b/acceptance/bundle/deploy/wal/test.toml index 23a203beb15..de4389e6f80 100644 --- a/acceptance/bundle/deploy/wal/test.toml +++ b/acceptance/bundle/deploy/wal/test.toml @@ -38,10 +38,6 @@ New = """${1} [PROCESS_KILLED] Exit code: [KILLED]""" -[[Repls]] -Old = "\r" -New = '' [[Repls]] Old = '"lineage":\s*"[0-9a-f-]+"' New = '"lineage": "[UUID]"' From e113406104bd9a98058ce7abdabcd72605cecfe8 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 15:09:11 +0200 Subject: [PATCH 59/80] rm unnecessary SERIAL replacement --- acceptance/bundle/deploy/wal/chain-3-jobs/output.txt | 2 +- acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt | 2 +- acceptance/bundle/deploy/wal/crash-after-create/output.txt | 2 +- acceptance/bundle/deploy/wal/empty-wal/output.txt | 2 +- acceptance/bundle/deploy/wal/future-serial-wal/output.txt | 2 +- acceptance/bundle/deploy/wal/lineage-mismatch/output.txt | 2 +- acceptance/bundle/deploy/wal/normal-deploy/output.txt | 2 +- acceptance/bundle/deploy/wal/stale-wal/output.txt | 4 ++-- acceptance/bundle/deploy/wal/test.toml | 4 ---- acceptance/bundle/deploy/wal/wal-with-delete/output.txt | 4 ++-- 10 files changed, 11 insertions(+), 15 deletions(-) diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt index 8ca8e388d39..bb41f0784bc 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt @@ -11,7 +11,7 @@ Exit code: [KILLED] { "cli_version": "[DEV_VERSION]", "lineage": "[UUID]", - "serial": [SERIAL], + "serial": 1, "state_version": 2 } { diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt index bf95cc13949..d04d0389ecb 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt +++
b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt @@ -1,5 +1,5 @@ === WAL content === -{"lineage":"test-lineage-123","serial": [SERIAL]} +{"lineage":"test-lineage-123","serial":6} {"k":"resources.jobs.valid_job","v":{"__id__":"[JOB1_ID]","state":{"name":"valid-job"}}} {"k":"resources.jobs.another_valid","v":{"__id__":"[JOB2_ID]","state":{"name":"another-valid"}}} {"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial-=== Deploy (should recover valid entries, skip corrupted last line) === diff --git a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt index abc6d177f65..cc6111ea9b3 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/output.txt +++ b/acceptance/bundle/deploy/wal/crash-after-create/output.txt @@ -8,7 +8,7 @@ Deploying resources... Exit code: [KILLED] === WAL should exist after crash === WAL exists (expected) -{"lineage":"[UUID]","serial": [SERIAL],"state_version":2,"cli_version":"[DEV_VERSION]"} +{"lineage":"[UUID]","serial":1,"state_version":2,"cli_version":"[DEV_VERSION]"} {"k":"resources.jobs.job_a","v":{"__id__":"1001","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} === State file after crash (should be empty) === cat: .databricks/bundle/default/resources.json: No such file or directory diff --git a/acceptance/bundle/deploy/wal/empty-wal/output.txt b/acceptance/bundle/deploy/wal/empty-wal/output.txt index 884f5027445..b4ce67ee661 100644 --- 
a/acceptance/bundle/deploy/wal/empty-wal/output.txt +++ b/acceptance/bundle/deploy/wal/empty-wal/output.txt @@ -16,7 +16,7 @@ Corrupted WAL file missing (expected) === State file content === { "lineage": "[UUID]", - "serial": [SERIAL], + "serial": 1, "state_keys": [ "resources.jobs.test_job" ] diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt index adb68c7c73b..48c23ddf84f 100644 --- a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt +++ b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt @@ -1,5 +1,5 @@ === WAL content === -{"lineage":"test-lineage-123","serial": [SERIAL]} +{"lineage":"test-lineage-123","serial":5} {"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} === Deploy (should fail with corruption error) === diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt index 53c517b5838..00bc78cf28d 100644 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt @@ -1,5 +1,5 @@ === WAL content === -{"lineage":"wal-lineage-bbb","serial": [SERIAL]} +{"lineage":"wal-lineage-bbb","serial":2} {"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} === Deploy (should fail with lineage mismatch error) === diff --git a/acceptance/bundle/deploy/wal/normal-deploy/output.txt b/acceptance/bundle/deploy/wal/normal-deploy/output.txt index ccb189ff09b..2ca4f5f51c1 100644 --- a/acceptance/bundle/deploy/wal/normal-deploy/output.txt +++ b/acceptance/bundle/deploy/wal/normal-deploy/output.txt @@ -9,7 +9,7 @@ WAL file deleted after successful deploy (expected) === State file content === { "lineage": "[UUID]", - "serial": [SERIAL], + "serial": 1, "state_keys": [ "resources.jobs.test_job" ] diff --git a/acceptance/bundle/deploy/wal/stale-wal/output.txt 
b/acceptance/bundle/deploy/wal/stale-wal/output.txt index d51d94d9657..91a7a07643d 100644 --- a/acceptance/bundle/deploy/wal/stale-wal/output.txt +++ b/acceptance/bundle/deploy/wal/stale-wal/output.txt @@ -1,5 +1,5 @@ === WAL content before deploy === -{"lineage":"stale-test-lineage","serial": [SERIAL]} +{"lineage":"stale-test-lineage","serial":1} {"k":"resources.jobs.stale_job","v":{"__id__":"9999","state":{"name":"stale-job"}}} === Deploy (should ignore stale WAL) === @@ -12,7 +12,7 @@ Deployment complete! Stale WAL deleted (expected) === State file should NOT contain stale_job === { - "serial": [SERIAL], + "serial": 3, "state_keys": [ "resources.jobs.test_job" ] diff --git a/acceptance/bundle/deploy/wal/test.toml b/acceptance/bundle/deploy/wal/test.toml index de4389e6f80..0ee34873e05 100644 --- a/acceptance/bundle/deploy/wal/test.toml +++ b/acceptance/bundle/deploy/wal/test.toml @@ -42,10 +42,6 @@ Exit code: [KILLED]""" Old = '"lineage":\s*"[0-9a-f-]+"' New = '"lineage": "[UUID]"' -[[Repls]] -Old = '"serial":\s*\d+' -New = '"serial": [SERIAL]' - # Strip single-node cluster warnings (they appear in varying order and aren't relevant to WAL tests) [[Repls]] Old = '(?s)Warning: Single node cluster.*?ResourceClass: SingleNode\n \n\n' diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/output.txt b/acceptance/bundle/deploy/wal/wal-with-delete/output.txt index a7960906d3f..c08e3651772 100644 --- a/acceptance/bundle/deploy/wal/wal-with-delete/output.txt +++ b/acceptance/bundle/deploy/wal/wal-with-delete/output.txt @@ -1,5 +1,5 @@ === WAL content === -{"lineage":"delete-test-lineage","serial": [SERIAL]} +{"lineage":"delete-test-lineage","serial":2} {"k":"resources.jobs.test_job","v":null} === Deploy (should recover delete from WAL) === @@ -10,7 +10,7 @@ Updating deployment state... Deployment complete! 
=== Final state (should have no jobs) === { - "serial": [SERIAL], + "serial": 2, "state_keys": [] } === WAL after successful deploy === From 190ce16f319da3bcebf94cfb23daac86048fe25a Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 15:09:56 +0200 Subject: [PATCH 60/80] rm noop replacement for lineage --- acceptance/bundle/deploy/wal/test.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/acceptance/bundle/deploy/wal/test.toml b/acceptance/bundle/deploy/wal/test.toml index 0ee34873e05..0e2bc852dcc 100644 --- a/acceptance/bundle/deploy/wal/test.toml +++ b/acceptance/bundle/deploy/wal/test.toml @@ -38,10 +38,6 @@ New = """${1} Exit code: [KILLED]""" -[[Repls]] -Old = '"lineage":\s*"[0-9a-f-]+"' -New = '"lineage": "[UUID]"' - # Strip single-node cluster warnings (they appear in varying order and aren't relevant to WAL tests) [[Repls]] Old = '(?s)Warning: Single node cluster.*?ResourceClass: SingleNode\n \n\n' From eca0376abddf928f2f190d86652b484d13205dd8 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 15:13:11 +0200 Subject: [PATCH 61/80] clean up --- acceptance/bundle/deploy/wal/chain-3-jobs/databricks.yml | 3 --- acceptance/bundle/deploy/wal/chain-3-jobs/output.txt | 2 -- .../bundle/deploy/wal/corrupted-wal-entry/databricks.yml | 2 -- .../bundle/deploy/wal/crash-after-create/databricks.yml | 2 -- acceptance/bundle/deploy/wal/crash-after-create/output.txt | 2 +- acceptance/bundle/deploy/wal/empty-wal/databricks.yml | 1 - .../bundle/deploy/wal/future-serial-wal/databricks.yml | 1 - acceptance/bundle/deploy/wal/lineage-mismatch/databricks.yml | 1 - acceptance/bundle/deploy/wal/normal-deploy/databricks.yml | 1 - acceptance/bundle/deploy/wal/stale-wal/databricks.yml | 1 - acceptance/bundle/deploy/wal/test.toml | 5 ----- 11 files changed, 1 insertion(+), 20 deletions(-) diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/databricks.yml b/acceptance/bundle/deploy/wal/chain-3-jobs/databricks.yml index fc3a46205bc..342a4516235 
100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/databricks.yml +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/databricks.yml @@ -15,7 +15,6 @@ resources: new_cluster: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge - num_workers: 0 job_02: name: "job-02" description: "depends on ${resources.jobs.job_01.id}" @@ -26,7 +25,6 @@ resources: new_cluster: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge - num_workers: 0 job_03: name: "job-03" description: "depends on ${resources.jobs.job_02.id}" @@ -37,4 +35,3 @@ resources: new_cluster: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge - num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt index bb41f0784bc..8c70ebafa3e 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt @@ -35,7 +35,6 @@ Exit code: [KILLED] { "new_cluster": { "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, "spark_version": "15.4.x-scala2.12" }, "spark_python_task": { @@ -74,7 +73,6 @@ Exit code: [KILLED] { "new_cluster": { "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, "spark_version": "15.4.x-scala2.12" }, "spark_python_task": { diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/databricks.yml b/acceptance/bundle/deploy/wal/corrupted-wal-entry/databricks.yml index cc9024fadab..a7a5cc2dfe0 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/databricks.yml +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/databricks.yml @@ -12,7 +12,6 @@ resources: new_cluster: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge - num_workers: 0 another_valid: name: "another-valid" tasks: @@ -22,4 +21,3 @@ resources: new_cluster: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge - num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/crash-after-create/databricks.yml b/acceptance/bundle/deploy/wal/crash-after-create/databricks.yml index 
31480454c55..25b2efe2f8c 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/databricks.yml +++ b/acceptance/bundle/deploy/wal/crash-after-create/databricks.yml @@ -13,7 +13,6 @@ resources: new_cluster: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge - num_workers: 0 job_b: name: "test-job-b" description: "depends on ${resources.jobs.job_a.id}" @@ -24,4 +23,3 @@ resources: new_cluster: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge - num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt index cc6111ea9b3..a5cdd4f40df 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/output.txt +++ b/acceptance/bundle/deploy/wal/crash-after-create/output.txt @@ -9,7 +9,7 @@ Exit code: [KILLED] === WAL should exist after crash === WAL exists (expected) {"lineage":"[UUID]","serial":1,"state_version":2,"cli_version":"[DEV_VERSION]"} -{"k":"resources.jobs.job_a","v":{"__id__":"1001","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} +{"k":"resources.jobs.job_a","v":{"__id__":"1001","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/state/metadata.json"},"description":"first 
job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} === State file after crash (should be empty) === cat: .databricks/bundle/default/resources.json: No such file or directory diff --git a/acceptance/bundle/deploy/wal/empty-wal/databricks.yml b/acceptance/bundle/deploy/wal/empty-wal/databricks.yml index 147a1e1482f..8da92255ff1 100644 --- a/acceptance/bundle/deploy/wal/empty-wal/databricks.yml +++ b/acceptance/bundle/deploy/wal/empty-wal/databricks.yml @@ -12,4 +12,3 @@ resources: new_cluster: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge - num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/databricks.yml b/acceptance/bundle/deploy/wal/future-serial-wal/databricks.yml index 67079aaef86..56fa1313376 100644 --- a/acceptance/bundle/deploy/wal/future-serial-wal/databricks.yml +++ b/acceptance/bundle/deploy/wal/future-serial-wal/databricks.yml @@ -12,4 +12,3 @@ resources: new_cluster: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge - num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/databricks.yml b/acceptance/bundle/deploy/wal/lineage-mismatch/databricks.yml index 014ec7f8860..32461d14676 100644 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/databricks.yml +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/databricks.yml @@ -12,4 +12,3 @@ resources: new_cluster: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge - num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/normal-deploy/databricks.yml b/acceptance/bundle/deploy/wal/normal-deploy/databricks.yml index 413705d40cb..4439322e0e6 100644 --- a/acceptance/bundle/deploy/wal/normal-deploy/databricks.yml +++ 
b/acceptance/bundle/deploy/wal/normal-deploy/databricks.yml @@ -12,4 +12,3 @@ resources: new_cluster: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge - num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/stale-wal/databricks.yml b/acceptance/bundle/deploy/wal/stale-wal/databricks.yml index 6b24f6fd269..443283607e6 100644 --- a/acceptance/bundle/deploy/wal/stale-wal/databricks.yml +++ b/acceptance/bundle/deploy/wal/stale-wal/databricks.yml @@ -12,4 +12,3 @@ resources: new_cluster: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge - num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/test.toml b/acceptance/bundle/deploy/wal/test.toml index 0e2bc852dcc..2be1964ae6b 100644 --- a/acceptance/bundle/deploy/wal/test.toml +++ b/acceptance/bundle/deploy/wal/test.toml @@ -37,8 +37,3 @@ New = """${1} [PROCESS_KILLED] Exit code: [KILLED]""" - -# Strip single-node cluster warnings (they appear in varying order and aren't relevant to WAL tests) -[[Repls]] -Old = '(?s)Warning: Single node cluster.*?ResourceClass: SingleNode\n \n\n' -New = '' From 78ef5f1a6c5430735971ce8e0fb5e74440201de2 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 16:24:11 +0200 Subject: [PATCH 62/80] testserver: replace KillCaller config with HTTP kill API Move kill-on-request behavior from test.toml fields (KillCaller, KillCallerOffset) to a POST /__testserver/kill endpoint. Kill rules are scoped by auth token so concurrent tests sharing a server don't interfere. acceptance/bin/kill_after.py is a convenience wrapper that posts to the endpoint, keeping scripts readable. The kill check is applied at the HTTP middleware layer (wrapping the entire router) so it fires for all requests, including those that would otherwise fall through to the not-found handler. 
Co-authored-by: Isaac --- acceptance/bin/kill_after.py | 39 +++++++ .../bundle/deploy/wal/chain-3-jobs/script | 2 + .../bundle/deploy/wal/chain-3-jobs/test.toml | 2 - .../deploy/wal/crash-after-create/script | 2 + .../deploy/wal/crash-after-create/test.toml | 1 - acceptance/internal/config.go | 12 -- acceptance/internal/prepare_server.go | 57 --------- .../selftest/kill_caller/currentuser/script | 1 + .../kill_caller/currentuser/test.toml | 3 - .../kill_caller/multi_pattern/output.txt | 4 +- .../selftest/kill_caller/multi_pattern/script | 3 + .../kill_caller/multi_pattern/test.toml | 16 +-- .../selftest/kill_caller/multiple/output.txt | 4 +- .../selftest/kill_caller/multiple/script | 2 + .../selftest/kill_caller/multiple/test.toml | 9 -- .../selftest/kill_caller/offset/output.txt | 12 +- acceptance/selftest/kill_caller/offset/script | 2 + .../selftest/kill_caller/offset/test.toml | 10 -- .../selftest/kill_caller/workspace/script | 1 + .../selftest/kill_caller/workspace/test.toml | 3 - libs/testserver/kill.go | 108 ++++++++++++++++++ .../testserver}/process_unix.go | 2 +- .../testserver}/process_windows.go | 2 +- libs/testserver/server.go | 26 ++++- 24 files changed, 198 insertions(+), 125 deletions(-) create mode 100755 acceptance/bin/kill_after.py create mode 100644 libs/testserver/kill.go rename {acceptance/internal => libs/testserver}/process_unix.go (94%) rename {acceptance/internal => libs/testserver}/process_windows.go (96%) diff --git a/acceptance/bin/kill_after.py b/acceptance/bin/kill_after.py new file mode 100755 index 00000000000..029123a13f5 --- /dev/null +++ b/acceptance/bin/kill_after.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +"""Set up a kill rule on the testserver for the current test token. + +Usage: kill_after.py PATTERN OFFSET TIMES + + PATTERN HTTP method and path, e.g. 
"POST /api/2.2/jobs/create" + OFFSET number of requests to let through before killing starts + TIMES number of times to kill the caller + +The rule is scoped to the current DATABRICKS_TOKEN so it only affects +the test that registers it, even when tests share a server. +""" + +import json +import os +import sys +import urllib.request + +host = os.environ.get("DATABRICKS_HOST", "") +token = os.environ.get("DATABRICKS_TOKEN", "") + +if not host: + print("DATABRICKS_HOST not set", file=sys.stderr) + sys.exit(1) + +if len(sys.argv) != 4: + print(f"usage: {sys.argv[0]} PATTERN OFFSET TIMES", file=sys.stderr) + sys.exit(1) + +pattern, offset, times = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]) + +data = json.dumps({"pattern": pattern, "offset": offset, "times": times}).encode() +req = urllib.request.Request( + f"{host}/__testserver/kill", + data=data, + headers={"Content-Type": "application/json", "Authorization": f"Bearer {token}"}, + method="POST", +) +urllib.request.urlopen(req) diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/script b/acceptance/bundle/deploy/wal/chain-3-jobs/script index 6c9993c2802..a1196f10c13 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/script +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/script @@ -1,3 +1,5 @@ +kill_after.py "POST /api/2.2/jobs/create" 2 1 + echo "=== First deploy (crashes on job_03) ===" trace errcode $CLI bundle deploy diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml b/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml index 2425c89deae..746896a789e 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml @@ -3,8 +3,6 @@ [[Server]] Pattern = "POST /api/2.2/jobs/create" -KillCallerOffset = 2 -KillCaller = 1 Response.Body = '{"job_id": 1001}' [[Server]] diff --git a/acceptance/bundle/deploy/wal/crash-after-create/script b/acceptance/bundle/deploy/wal/crash-after-create/script index d09f6ab06eb..bb33d678700 100644 --- 
a/acceptance/bundle/deploy/wal/crash-after-create/script +++ b/acceptance/bundle/deploy/wal/crash-after-create/script @@ -1,3 +1,5 @@ +kill_after.py "GET /api/2.2/jobs/get" 0 1 + echo "=== First deploy (crashes after job_a create, before job_b) ===" trace errcode $CLI bundle deploy diff --git a/acceptance/bundle/deploy/wal/crash-after-create/test.toml b/acceptance/bundle/deploy/wal/crash-after-create/test.toml index eebad72de53..d1e99eadb73 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/test.toml +++ b/acceptance/bundle/deploy/wal/crash-after-create/test.toml @@ -12,5 +12,4 @@ Response.Body = '{}' [[Server]] Pattern = "GET /api/2.2/jobs/get" -KillCaller = 1 Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' diff --git a/acceptance/internal/config.go b/acceptance/internal/config.go index dc63911173c..559e11d0ca6 100644 --- a/acceptance/internal/config.go +++ b/acceptance/internal/config.go @@ -153,18 +153,6 @@ type ServerStub struct { // Configure as "1ms", "2s", "3m", etc. // See [time.ParseDuration] for details. Delay time.Duration - - // Number of times to kill the caller process before returning normal responses. - // 0 = never kill (default), 1 = kill once then allow, 2 = kill twice then allow, etc. - // Useful for testing crash recovery scenarios where first deploy crashes but retry succeeds. - // Requires DATABRICKS_CLI_TEST_PID=1 to be set in the test environment. - KillCaller int - - // Number of requests to let pass before starting to kill. - // Combined with KillCaller, this creates a window: requests 1 to Offset succeed, - // requests Offset+1 to Offset+KillCaller are killed, rest succeed. - // Example: KillCallerOffset=9, KillCaller=1 means let 9 requests pass, kill the 10th. 
- KillCallerOffset int } // FindConfigs finds all the config relevant for this test, diff --git a/acceptance/internal/prepare_server.go b/acceptance/internal/prepare_server.go index 2f1b6712a2f..f8be1ae947a 100644 --- a/acceptance/internal/prepare_server.go +++ b/acceptance/internal/prepare_server.go @@ -183,10 +183,6 @@ func startLocalServer(t *testing.T, s.ResponseCallback = logResponseCallback(t) } - killCounters := make(map[string]int) - offsetCounters := make(map[string]int) - killCountersMu := &sync.Mutex{} - for ind := range stubs { // Later stubs take precedence over earlier ones (leaf configs override parent configs). // The first handler registered for a given pattern wins, so we reverse the order. @@ -195,11 +191,6 @@ func startLocalServer(t *testing.T, items := strings.Split(stub.Pattern, " ") require.Len(t, items, 2) - if stub.KillCaller > 0 { - killCounters[stub.Pattern] = stub.KillCaller - offsetCounters[stub.Pattern] = stub.KillCallerOffset - } - s.Handle(items[0], items[1], func(req testserver.Request) any { if stub.Delay > 0 { ctx := req.Context @@ -218,10 +209,6 @@ func startLocalServer(t *testing.T, } } - if shouldKillCaller(stub, offsetCounters, killCounters, killCountersMu) { - killCaller(t, stub.Pattern, req.Headers) - } - return stub.Response }) } @@ -232,50 +219,6 @@ func startLocalServer(t *testing.T, return s.URL } -func shouldKillCaller(stub ServerStub, offsetCounters, killCounters map[string]int, mu *sync.Mutex) bool { - if stub.KillCaller <= 0 { - return false - } - mu.Lock() - defer mu.Unlock() - - if offsetCounters[stub.Pattern] > 0 { - offsetCounters[stub.Pattern]-- - return false - } - - if killCounters[stub.Pattern] <= 0 { - return false - } - killCounters[stub.Pattern]-- - return true -} - -func killCaller(t *testing.T, pattern string, headers http.Header) { - pid := testserver.ExtractPidFromHeaders(headers) - if pid == 0 { - t.Errorf("KillCaller configured but test-pid not found in User-Agent") - return - } - - process, err := 
os.FindProcess(pid) - if err != nil { - t.Errorf("Failed to find process %d: %s", pid, err) - return - } - - // Use process.Kill() for cross-platform compatibility. - // On Unix, this sends SIGKILL. On Windows, this calls TerminateProcess. - if err := process.Kill(); err != nil { - t.Errorf("Failed to kill process %d: %s", pid, err) - return - } - - if !waitForProcessExit(pid, 2*time.Second) { - t.Logf("KillCaller: timed out waiting for PID %d to exit (pattern: %s)", pid, pattern) - } - t.Logf("KillCaller: killed PID %d (pattern: %s)", pid, pattern) -} func startProxyServer(t *testing.T, recordRequests bool, diff --git a/acceptance/selftest/kill_caller/currentuser/script b/acceptance/selftest/kill_caller/currentuser/script index 821c42d8cf7..bbac4ab29ab 100644 --- a/acceptance/selftest/kill_caller/currentuser/script +++ b/acceptance/selftest/kill_caller/currentuser/script @@ -1,2 +1,3 @@ +kill_after.py "GET /api/2.0/preview/scim/v2/Me" 0 1 trace errcode $CLI current-user me echo "Script continued after kill" diff --git a/acceptance/selftest/kill_caller/currentuser/test.toml b/acceptance/selftest/kill_caller/currentuser/test.toml index b76fe401fcb..f6311367158 100644 --- a/acceptance/selftest/kill_caller/currentuser/test.toml +++ b/acceptance/selftest/kill_caller/currentuser/test.toml @@ -1,4 +1 @@ # Kill the CLI when it calls /Me endpoint (once, then allow) -[[Server]] -Pattern = "GET /api/2.0/preview/scim/v2/Me" -KillCaller = 1 diff --git a/acceptance/selftest/kill_caller/multi_pattern/output.txt b/acceptance/selftest/kill_caller/multi_pattern/output.txt index 9b41f23ec4d..b3528428352 100644 --- a/acceptance/selftest/kill_caller/multi_pattern/output.txt +++ b/acceptance/selftest/kill_caller/multi_pattern/output.txt @@ -13,8 +13,8 @@ Me attempt 2 done >>> [CLI] current-user me { - "id": "123", - "userName": "test@example.com" + "id": "[USERID]", + "userName": "[USERNAME]" } Me attempt 3 done - success! 
diff --git a/acceptance/selftest/kill_caller/multi_pattern/script b/acceptance/selftest/kill_caller/multi_pattern/script index ba9447a29a7..e0b5523c45c 100644 --- a/acceptance/selftest/kill_caller/multi_pattern/script +++ b/acceptance/selftest/kill_caller/multi_pattern/script @@ -1,3 +1,6 @@ +kill_after.py "GET /api/2.0/preview/scim/v2/Me" 0 2 +kill_after.py "GET /api/2.0/workspace/list" 0 1 + # Test pattern 1: /Me endpoint (kills first 2, then allows) trace errcode $CLI current-user me echo "Me attempt 1 done" diff --git a/acceptance/selftest/kill_caller/multi_pattern/test.toml b/acceptance/selftest/kill_caller/multi_pattern/test.toml index 08bdc17085d..4565475423d 100644 --- a/acceptance/selftest/kill_caller/multi_pattern/test.toml +++ b/acceptance/selftest/kill_caller/multi_pattern/test.toml @@ -1,17 +1,5 @@ -# Test that multiple patterns can have independent KillCaller counts -# Pattern 1: Kill first 2 requests to /Me endpoint -# Pattern 2: Kill first 1 request to /workspace/list endpoint - -[[Server]] -Pattern = "GET /api/2.0/preview/scim/v2/Me" -KillCaller = 2 -Response.Body = ''' -{ - "id": "123", - "userName": "test@example.com" -} -''' +# Test that multiple patterns can have independent kill counts [[Server]] Pattern = "GET /api/2.0/workspace/list" -KillCaller = 1 +Response.Body = '{"objects": []}' diff --git a/acceptance/selftest/kill_caller/multiple/output.txt b/acceptance/selftest/kill_caller/multiple/output.txt index 27b034cfcb1..3b6aea849fd 100644 --- a/acceptance/selftest/kill_caller/multiple/output.txt +++ b/acceptance/selftest/kill_caller/multiple/output.txt @@ -19,7 +19,7 @@ Attempt 3 done >>> [CLI] current-user me { - "id": "123", - "userName": "test@example.com" + "id": "[USERID]", + "userName": "[USERNAME]" } Attempt 4 done - success! 
diff --git a/acceptance/selftest/kill_caller/multiple/script b/acceptance/selftest/kill_caller/multiple/script index 03628e203ed..a3659bf58fd 100644 --- a/acceptance/selftest/kill_caller/multiple/script +++ b/acceptance/selftest/kill_caller/multiple/script @@ -1,3 +1,5 @@ +kill_after.py "GET /api/2.0/preview/scim/v2/Me" 0 3 + # First 3 attempts should be killed trace errcode $CLI current-user me echo "Attempt 1 done" diff --git a/acceptance/selftest/kill_caller/multiple/test.toml b/acceptance/selftest/kill_caller/multiple/test.toml index 5485fc6a6bb..24f7ca19229 100644 --- a/acceptance/selftest/kill_caller/multiple/test.toml +++ b/acceptance/selftest/kill_caller/multiple/test.toml @@ -1,10 +1 @@ # Kill the CLI 3 times, then allow the 4th request to succeed -[[Server]] -Pattern = "GET /api/2.0/preview/scim/v2/Me" -KillCaller = 3 -Response.Body = ''' -{ - "id": "123", - "userName": "test@example.com" -} -''' diff --git a/acceptance/selftest/kill_caller/offset/output.txt b/acceptance/selftest/kill_caller/offset/output.txt index cb87595a2c2..b6959aec5e2 100644 --- a/acceptance/selftest/kill_caller/offset/output.txt +++ b/acceptance/selftest/kill_caller/offset/output.txt @@ -1,15 +1,15 @@ >>> [CLI] current-user me { - "id": "123", - "userName": "test@example.com" + "id": "[USERID]", + "userName": "[USERNAME]" } Attempt 1 done - success (offset) >>> [CLI] current-user me { - "id": "123", - "userName": "test@example.com" + "id": "[USERID]", + "userName": "[USERNAME]" } Attempt 2 done - success (offset) @@ -27,7 +27,7 @@ Attempt 4 done - killed >>> [CLI] current-user me { - "id": "123", - "userName": "test@example.com" + "id": "[USERID]", + "userName": "[USERNAME]" } Attempt 5 done - success (past kill window) diff --git a/acceptance/selftest/kill_caller/offset/script b/acceptance/selftest/kill_caller/offset/script index 3411e874806..1bf3d0d4c2e 100644 --- a/acceptance/selftest/kill_caller/offset/script +++ b/acceptance/selftest/kill_caller/offset/script @@ -1,3 +1,5 @@ 
+kill_after.py "GET /api/2.0/preview/scim/v2/Me" 2 2 + # First 2 attempts should succeed (offset period) trace $CLI current-user me echo "Attempt 1 done - success (offset)" diff --git a/acceptance/selftest/kill_caller/offset/test.toml b/acceptance/selftest/kill_caller/offset/test.toml index 5eab09dbfaa..7b8d50906c6 100644 --- a/acceptance/selftest/kill_caller/offset/test.toml +++ b/acceptance/selftest/kill_caller/offset/test.toml @@ -1,11 +1 @@ # Let first 2 requests pass, kill next 2, then allow rest -[[Server]] -Pattern = "GET /api/2.0/preview/scim/v2/Me" -KillCallerOffset = 2 -KillCaller = 2 -Response.Body = ''' -{ - "id": "123", - "userName": "test@example.com" -} -''' diff --git a/acceptance/selftest/kill_caller/workspace/script b/acceptance/selftest/kill_caller/workspace/script index 076972136c9..8fb9dab3f1a 100644 --- a/acceptance/selftest/kill_caller/workspace/script +++ b/acceptance/selftest/kill_caller/workspace/script @@ -1,2 +1,3 @@ +kill_after.py "GET /api/2.0/workspace/list" 0 1 trace errcode $CLI workspace list / echo "Script continued after kill" diff --git a/acceptance/selftest/kill_caller/workspace/test.toml b/acceptance/selftest/kill_caller/workspace/test.toml index eac10a6329b..80d2fbbfd17 100644 --- a/acceptance/selftest/kill_caller/workspace/test.toml +++ b/acceptance/selftest/kill_caller/workspace/test.toml @@ -1,4 +1 @@ # Kill the CLI when it calls workspace list endpoint (once, then allow) -[[Server]] -Pattern = "GET /api/2.0/workspace/list" -KillCaller = 1 diff --git a/libs/testserver/kill.go b/libs/testserver/kill.go new file mode 100644 index 00000000000..e24b13a0f11 --- /dev/null +++ b/libs/testserver/kill.go @@ -0,0 +1,108 @@ +package testserver + +import ( + "encoding/json" + "net/http" + "os" + "sync" + "time" + + "github.com/databricks/cli/internal/testutil" +) + +type killRuleKey struct { + token string + pattern string // "METHOD /path" +} + +type killRule struct { + offset int + times int +} + +type killRules struct { + mu 
sync.Mutex + rules map[killRuleKey]*killRule +} + +func newKillRules() *killRules { + return &killRules{rules: make(map[killRuleKey]*killRule)} +} + +func (kr *killRules) set(token, pattern string, offset, times int) { + kr.mu.Lock() + defer kr.mu.Unlock() + kr.rules[killRuleKey{token: token, pattern: pattern}] = &killRule{offset: offset, times: times} +} + +// check returns true if the caller should be killed for this request. +// It also performs the kill. +func (kr *killRules) check(t testutil.TestingT, method, path, token string, headers http.Header) bool { + pattern := method + " " + path + key := killRuleKey{token: token, pattern: pattern} + + kr.mu.Lock() + rule, ok := kr.rules[key] + if !ok { + kr.mu.Unlock() + return false + } + if rule.offset > 0 { + rule.offset-- + kr.mu.Unlock() + return false + } + if rule.times <= 0 { + delete(kr.rules, key) + kr.mu.Unlock() + return false + } + rule.times-- + if rule.times == 0 { + delete(kr.rules, key) + } + kr.mu.Unlock() + + killProcess(t, pattern, headers) + return true +} + +func killProcess(t testutil.TestingT, pattern string, headers http.Header) { + pid := ExtractPidFromHeaders(headers) + if pid == 0 { + t.Errorf("kill rule matched %q but test-pid not found in User-Agent", pattern) + return + } + + process, err := os.FindProcess(pid) + if err != nil { + t.Errorf("Failed to find process %d: %s", pid, err) + return + } + + if err := process.Kill(); err != nil { + t.Errorf("Failed to kill process %d: %s", pid, err) + return + } + + if !waitForProcessExit(pid, 2*time.Second) { + t.Logf("kill: timed out waiting for PID %d to exit (pattern: %s)", pid, pattern) + } + t.Logf("kill: killed PID %d (pattern: %s)", pid, pattern) +} + +// killEndpointHandler returns a HandlerFunc for POST /__testserver/kill. 
+func killEndpointHandler(kr *killRules) HandlerFunc { + return func(req Request) any { + var body struct { + Pattern string `json:"pattern"` + Offset int `json:"offset"` + Times int `json:"times"` + } + if err := json.Unmarshal(req.Body, &body); err != nil { + return Response{StatusCode: 400, Body: map[string]string{"error": err.Error()}} + } + kr.set(req.Token, body.Pattern, body.Offset, body.Times) + return Response{StatusCode: 200} + } +} diff --git a/acceptance/internal/process_unix.go b/libs/testserver/process_unix.go similarity index 94% rename from acceptance/internal/process_unix.go rename to libs/testserver/process_unix.go index 1e0b0ead3e1..8b82187580e 100644 --- a/acceptance/internal/process_unix.go +++ b/libs/testserver/process_unix.go @@ -1,6 +1,6 @@ //go:build linux || darwin -package internal +package testserver import ( "syscall" diff --git a/acceptance/internal/process_windows.go b/libs/testserver/process_windows.go similarity index 96% rename from acceptance/internal/process_windows.go rename to libs/testserver/process_windows.go index fdad8b4f5e2..2a32fe4ede0 100644 --- a/acceptance/internal/process_windows.go +++ b/libs/testserver/process_windows.go @@ -1,6 +1,6 @@ //go:build windows -package internal +package testserver import ( "time" diff --git a/libs/testserver/server.go b/libs/testserver/server.go index 40556e55294..aa05aee5ab4 100644 --- a/libs/testserver/server.go +++ b/libs/testserver/server.go @@ -46,6 +46,8 @@ type Server struct { fakeOidc *FakeOidc mu sync.Mutex + kills *killRules + RequestCallback func(request *Request) ResponseCallback func(request *Request, response *EncodedResponse) } @@ -58,6 +60,7 @@ type Request struct { Vars map[string]string Workspace *FakeWorkspace Context context.Context + Token string } type Response struct { @@ -200,7 +203,19 @@ func getHeaders(value []byte) http.Header { func New(t testutil.TestingT) *Server { router := NewRouter() - server := httptest.NewServer(router) + kills := newKillRules() + + // 
Wrap the router so kill rules fire for ALL requests, including those with + // no registered handler that would otherwise bypass serve() entirely. + killMiddleware := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + token := getToken(r) + if kills.check(t, r.Method, r.URL.Path, token, r.Header) { + return + } + router.ServeHTTP(w, r) + }) + + server := httptest.NewServer(killMiddleware) t.Cleanup(server.Close) s := &Server{ @@ -209,6 +224,7 @@ func New(t testutil.TestingT) *Server { t: t, fakeWorkspaces: map[string]*FakeWorkspace{}, fakeOidc: &FakeOidc{url: server.URL}, + kills: kills, } router.Dispatch = s.serve @@ -258,6 +274,9 @@ Response.Body = '' }) router.NotFound = notFoundFunc + // Register a test-only endpoint for setting up kill rules from scripts. + s.Handle("POST", "/__testserver/kill", killEndpointHandler(s.kills)) + // Register a default handler for the SDK's host metadata discovery endpoint. // The SDK resolves this during config initialization (as of v0.126.0) to // determine workspace/account IDs, cloud, and OIDC endpoints. Without this @@ -289,12 +308,15 @@ func (s *Server) getWorkspaceForToken(token string) *FakeWorkspace { } func (s *Server) serve(w http.ResponseWriter, r *http.Request, handler HandlerFunc, vars map[string]string) { + token := getToken(r) + // Each test uses unique DATABRICKS_TOKEN, we simulate each token having // it's own fake fakeWorkspace to avoid interference between tests. 
- fakeWorkspace := s.getWorkspaceForToken(getToken(r)) + fakeWorkspace := s.getWorkspaceForToken(token) request := NewRequest(s.t, r, fakeWorkspace) request.Vars = vars + request.Token = token if s.RequestCallback != nil { s.RequestCallback(&request) From 284c4db9da5acb2db48cd9fadbd5b4e9a4fa54c1 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 16:25:44 +0200 Subject: [PATCH 63/80] remove blank line --- acceptance/internal/prepare_server.go | 1 - 1 file changed, 1 deletion(-) diff --git a/acceptance/internal/prepare_server.go b/acceptance/internal/prepare_server.go index f8be1ae947a..299d48f03ee 100644 --- a/acceptance/internal/prepare_server.go +++ b/acceptance/internal/prepare_server.go @@ -219,7 +219,6 @@ func startLocalServer(t *testing.T, return s.URL } - func startProxyServer(t *testing.T, recordRequests bool, logRequests bool, From d2362e202ddf5b4a42fc56ee62af0f226a939c5c Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 16:36:14 +0200 Subject: [PATCH 64/80] wal tests: remove redundant server stubs covered by default handlers Co-authored-by: Isaac --- acceptance/bundle/deploy/wal/chain-3-jobs/output.txt | 10 +++++----- acceptance/bundle/deploy/wal/chain-3-jobs/test.toml | 12 ------------ .../bundle/deploy/wal/crash-after-create/output.txt | 2 +- .../bundle/deploy/wal/crash-after-create/test.toml | 11 ----------- 4 files changed, 6 insertions(+), 29 deletions(-) diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt index 8c70ebafa3e..e675bb689d3 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt @@ -17,7 +17,7 @@ Exit code: [KILLED] { "k": "resources.jobs.job_01", "v": { - "__id__": "1001", + "__id__": "[NUMID]", "state": { "deployment": { "kind": "BUNDLE", @@ -49,7 +49,7 @@ Exit code: [KILLED] { "k": "resources.jobs.job_02", "v": { - "__id__": "1001", + "__id__": "[NUMID]", 
"depends_on": [ { "label": "${resources.jobs.job_01.id}", @@ -61,7 +61,7 @@ Exit code: [KILLED] "kind": "BUNDLE", "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" }, - "description": "depends on 1001", + "description": "depends on [NUMID]", "edit_mode": "UI_LOCKED", "format": "MULTI_TASK", "max_concurrent_runs": 1, @@ -98,10 +98,10 @@ Resources: Jobs: job_01: Name: job-01 - URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + URL: [DATABRICKS_URL]/jobs/[NUMID]?o=[NUMID] job_02: Name: job-02 - URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + URL: [DATABRICKS_URL]/jobs/[NUMID]?o=[NUMID] job_03: Name: job-03 URL: (not deployed) diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml b/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml index 746896a789e..932f3ae97a4 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml @@ -1,14 +1,2 @@ # Linear chain: job_01 -> job_02 -> job_03 # Let first 2 jobs/create succeed, then kill on the 3rd - -[[Server]] -Pattern = "POST /api/2.2/jobs/create" -Response.Body = '{"job_id": 1001}' - -[[Server]] -Pattern = "POST /api/2.2/jobs/reset" -Response.Body = '{}' - -[[Server]] -Pattern = "GET /api/2.2/jobs/get" -Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' diff --git a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt index a5cdd4f40df..4eb2e1ea122 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/output.txt +++ b/acceptance/bundle/deploy/wal/crash-after-create/output.txt @@ -9,7 +9,7 @@ Exit code: [KILLED] === WAL should exist after crash === WAL exists (expected) {"lineage":"[UUID]","serial":1,"state_version":2,"cli_version":"[DEV_VERSION]"} 
-{"k":"resources.jobs.job_a","v":{"__id__":"1001","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} +{"k":"resources.jobs.job_a","v":{"__id__":"[NUMID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} === State file after crash (should be empty) === cat: .databricks/bundle/default/resources.json: No such file or directory diff --git a/acceptance/bundle/deploy/wal/crash-after-create/test.toml b/acceptance/bundle/deploy/wal/crash-after-create/test.toml index d1e99eadb73..8e4ca4a8495 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/test.toml +++ b/acceptance/bundle/deploy/wal/crash-after-create/test.toml @@ -2,14 +2,3 @@ # Second deploy recovers from WAL and completes successfully. # job_b depends on job_a, so jobs/get is called after job_a's SaveState. 
-[[Server]] -Pattern = "POST /api/2.2/jobs/create" -Response.Body = '{"job_id": 1001}' - -[[Server]] -Pattern = "POST /api/2.2/jobs/reset" -Response.Body = '{}' - -[[Server]] -Pattern = "GET /api/2.2/jobs/get" -Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' From 22fd654cab29a827eb4eca1063dfd5b9aca2e3e5 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 16:40:21 +0200 Subject: [PATCH 65/80] wal tests: move test.toml comments to script, remove empty test.toml files Co-authored-by: Isaac --- acceptance/bundle/deploy/wal/chain-3-jobs/script | 2 ++ acceptance/bundle/deploy/wal/chain-3-jobs/test.toml | 2 -- acceptance/bundle/deploy/wal/crash-after-create/script | 3 +++ acceptance/bundle/deploy/wal/crash-after-create/test.toml | 4 ---- 4 files changed, 5 insertions(+), 6 deletions(-) delete mode 100644 acceptance/bundle/deploy/wal/chain-3-jobs/test.toml delete mode 100644 acceptance/bundle/deploy/wal/crash-after-create/test.toml diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/script b/acceptance/bundle/deploy/wal/chain-3-jobs/script index a1196f10c13..2bd55befcd8 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/script +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/script @@ -1,3 +1,5 @@ +# Linear chain: job_01 -> job_02 -> job_03 +# Let first 2 jobs/create succeed, then kill on the 3rd kill_after.py "POST /api/2.2/jobs/create" 2 1 echo "=== First deploy (crashes on job_03) ===" diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml b/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml deleted file mode 100644 index 932f3ae97a4..00000000000 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml +++ /dev/null @@ -1,2 +0,0 @@ -# Linear chain: job_01 -> job_02 -> job_03 -# Let first 2 jobs/create succeed, then kill on the 3rd diff --git a/acceptance/bundle/deploy/wal/crash-after-create/script b/acceptance/bundle/deploy/wal/crash-after-create/script index bb33d678700..f4dba936bba 100644 --- 
a/acceptance/bundle/deploy/wal/crash-after-create/script +++ b/acceptance/bundle/deploy/wal/crash-after-create/script @@ -1,3 +1,6 @@ +# WAL recovery after real crash. First deploy creates job_a then crashes. +# Second deploy recovers from WAL and completes successfully. +# job_b depends on job_a, so jobs/get is called after job_a's SaveState. kill_after.py "GET /api/2.2/jobs/get" 0 1 echo "=== First deploy (crashes after job_a create, before job_b) ===" diff --git a/acceptance/bundle/deploy/wal/crash-after-create/test.toml b/acceptance/bundle/deploy/wal/crash-after-create/test.toml deleted file mode 100644 index 8e4ca4a8495..00000000000 --- a/acceptance/bundle/deploy/wal/crash-after-create/test.toml +++ /dev/null @@ -1,4 +0,0 @@ -# WAL recovery after real crash. First deploy creates job_a then crashes. -# Second deploy recovers from WAL and completes successfully. -# job_b depends on job_a, so jobs/get is called after job_a's SaveState. - From 853b56b0dae0cfee46f09056a9dee040611bca83 Mon Sep 17 00:00:00 2001 From: Tester Date: Mon, 11 May 2026 16:55:17 +0200 Subject: [PATCH 66/80] Add databricks.yml --- databricks.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 databricks.yml diff --git a/databricks.yml b/databricks.yml new file mode 100644 index 00000000000..7cf210722a2 --- /dev/null +++ b/databricks.yml @@ -0,0 +1,19 @@ +bundle: + name: git + git: + # This is currently not supported + branch: ${var.deployment_branch} + +variables: + deployment_branch: + # By setting deployment_branch to "" we set bundle.git.branch to "" which is the same as unsetting it. + # This should make CLI read branch from git and update bundle.git.branch accordingly. It should + # Also set bundle.git.inferred to true.
+ default: "" + +targets: + prod: + default: true + dev: + variables: + deployment_branch: dev-branch From 607f657270ef57ddd8b2fe73adc3810087d91329 Mon Sep 17 00:00:00 2001 From: Tester Date: Mon, 11 May 2026 17:01:35 +0200 Subject: [PATCH 67/80] clean up --- acceptance/bundle/deploy/wal/chain-3-jobs/output.txt | 8 -------- acceptance/bundle/deploy/wal/chain-3-jobs/script | 4 ---- 2 files changed, 12 deletions(-) diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt index e675bb689d3..14585774327 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt @@ -106,13 +106,5 @@ Resources: Name: job-03 URL: (not deployed) -=== Second deploy (recovery) === - ->>> [CLI] bundle deploy --force-lock -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! 
- === WAL after successful deploy === WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/script b/acceptance/bundle/deploy/wal/chain-3-jobs/script index 2bd55befcd8..e874a0fcac9 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/script +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/script @@ -17,10 +17,6 @@ echo "" echo "=== Bundle summary (reads from WAL) ===" $CLI bundle summary -echo "" -echo "=== Second deploy (recovery) ===" -trace $CLI bundle deploy --force-lock - echo "" echo "=== WAL after successful deploy ===" cat .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "WAL deleted (expected)" From 4b4a022e29a6f464595e4f0e1c694f1091e4055b Mon Sep 17 00:00:00 2001 From: Tester Date: Mon, 11 May 2026 17:02:03 +0200 Subject: [PATCH 68/80] add replace_ids.py --- acceptance/bundle/deploy/wal/chain-3-jobs/output.txt | 10 +++++----- acceptance/bundle/deploy/wal/chain-3-jobs/script | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt index 14585774327..7e04ba4dae3 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt @@ -17,7 +17,7 @@ Exit code: [KILLED] { "k": "resources.jobs.job_01", "v": { - "__id__": "[NUMID]", + "__id__": "[JOB_01_ID]", "state": { "deployment": { "kind": "BUNDLE", @@ -49,7 +49,7 @@ Exit code: [KILLED] { "k": "resources.jobs.job_02", "v": { - "__id__": "[NUMID]", + "__id__": "[JOB_02_ID]", "depends_on": [ { "label": "${resources.jobs.job_01.id}", @@ -61,7 +61,7 @@ Exit code: [KILLED] "kind": "BUNDLE", "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" }, - "description": "depends on [NUMID]", + "description": "depends on [JOB_01_ID]", "edit_mode": "UI_LOCKED", "format": "MULTI_TASK", "max_concurrent_runs": 1, @@ -98,10 +98,10 @@ Resources: Jobs: job_01: Name: 
job-01 - URL: [DATABRICKS_URL]/jobs/[NUMID]?o=[NUMID] + URL: [DATABRICKS_URL]/jobs/[JOB_01_ID]?o=[NUMID] job_02: Name: job-02 - URL: [DATABRICKS_URL]/jobs/[NUMID]?o=[NUMID] + URL: [DATABRICKS_URL]/jobs/[JOB_02_ID]?o=[NUMID] job_03: Name: job-03 URL: (not deployed) diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/script b/acceptance/bundle/deploy/wal/chain-3-jobs/script index e874a0fcac9..a5afc6f51d5 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/script +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/script @@ -20,3 +20,5 @@ $CLI bundle summary echo "" echo "=== WAL after successful deploy ===" cat .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "WAL deleted (expected)" + +replace_ids.py From 29e0ca52deeb2eddb62a9c30e81d0242c8816ece Mon Sep 17 00:00:00 2001 From: Tester Date: Mon, 11 May 2026 17:03:39 +0200 Subject: [PATCH 69/80] clean up --- .../bundle/deploy/wal/corrupted-wal-entry/output.txt | 8 +++----- acceptance/bundle/deploy/wal/corrupted-wal-entry/script | 9 +-------- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt index d04d0389ecb..afd717a27b3 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt @@ -1,9 +1,9 @@ -=== WAL content === + +>>> cat .databricks/bundle/default/resources.json.wal {"lineage":"test-lineage-123","serial":6} {"k":"resources.jobs.valid_job","v":{"__id__":"[JOB1_ID]","state":{"name":"valid-job"}}} {"k":"resources.jobs.another_valid","v":{"__id__":"[JOB2_ID]","state":{"name":"another-valid"}}} -{"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial-=== Deploy (should recover valid entries, skip corrupted last line) === - +{"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial- >>> [CLI] bundle deploy Warn: Skipping corrupted WAL entry at 
[TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal:4: unexpected end of JSON input Warn: Saved 1 corrupted WAL entries to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted @@ -11,7 +11,6 @@ Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-test Deploying resources... Updating deployment state... Deployment complete! -=== Final state (should have recovered entries) === >>> [CLI] bundle summary Name: wal-corrupted-test @@ -27,7 +26,6 @@ Resources: valid_job: Name: valid-job URL: [DATABRICKS_URL]/jobs/[JOB1_ID]?o=[NUMID] -=== Corrupted WAL entries file === >>> cat .databricks/bundle/default/resources.json.wal.corrupted {"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial- diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script index b6b12c347b5..ae828cdb6b1 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script @@ -15,16 +15,9 @@ cp resources.json .databricks/bundle/default/ printf '{"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial-' } > .databricks/bundle/default/resources.json.wal -echo "=== WAL content ===" -cat .databricks/bundle/default/resources.json.wal - -echo "=== Deploy (should recover valid entries, skip corrupted last line) ===" +trace cat .databricks/bundle/default/resources.json.wal trace $CLI bundle deploy - -echo "=== Final state (should have recovered entries) ===" trace $CLI bundle summary - -echo "=== Corrupted WAL entries file ===" trace cat .databricks/bundle/default/resources.json.wal.corrupted printf "\n=== WAL after successful deploy ===\n" From c98b3dd01d771e5d51b70ad4ccdb9bc58575b002 Mon Sep 17 00:00:00 2001 From: Tester Date: Mon, 11 May 2026 17:07:56 +0200 Subject: [PATCH 70/80] test more commands for validation --- .../bundle/deploy/wal/lineage-mismatch/out.test.toml | 1 + 
.../bundle/deploy/wal/lineage-mismatch/output.txt | 11 ++++------- acceptance/bundle/deploy/wal/lineage-mismatch/script | 7 +++---- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml b/acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml index e90b6d5d1ba..9448f875df7 100644 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml @@ -1,3 +1,4 @@ Local = true Cloud = false +EnvMatrix.COMMAND = ["deploy", "plan", "summary"] EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt index 00bc78cf28d..cae1ffac083 100644 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt @@ -1,10 +1,7 @@ -=== WAL content === -{"lineage":"wal-lineage-bbb","serial":2} -{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} -=== Deploy (should fail with lineage mismatch error) === - ->>> errcode [CLI] bundle deploy +Any command should fail with lineage mismatch error Error: reading state from [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: WAL recovery failed: WAL lineage (wal-lineage-bbb) does not match state lineage (state-lineage-aaa) -Exit code: 1 +>>> musterr [CLI] bundle destroy --auto-approve +Error: reading state from [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: WAL recovery failed: WAL lineage (wal-lineage-bbb) does not match state lineage (state-lineage-aaa) + diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/script b/acceptance/bundle/deploy/wal/lineage-mismatch/script index 4617c338fe7..0629a37c0f9 100644 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/script +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/script @@ -2,8 +2,7 @@ mkdir -p .databricks/bundle/default cp 
resources.json .databricks/bundle/default/ cp resources.json.wal .databricks/bundle/default/ -echo "=== WAL content ===" -cat .databricks/bundle/default/resources.json.wal +echo "Any command should fail with lineage mismatch error" +musterr $CLI bundle $COMMAND -echo "=== Deploy (should fail with lineage mismatch error) ===" -trace errcode $CLI bundle deploy +trace musterr $CLI bundle destroy --auto-approve From f151e71c6058a73064dc560557aaf472e62166d7 Mon Sep 17 00:00:00 2001 From: Tester Date: Mon, 11 May 2026 17:08:33 +0200 Subject: [PATCH 71/80] remove normal-deploy test --- .../deploy/wal/normal-deploy/databricks.yml | 14 -------------- .../deploy/wal/normal-deploy/out.test.toml | 3 --- .../bundle/deploy/wal/normal-deploy/output.txt | 16 ---------------- .../bundle/deploy/wal/normal-deploy/script | 12 ------------ .../bundle/deploy/wal/normal-deploy/test.py | 1 - .../bundle/deploy/wal/normal-deploy/test.toml | 9 --------- 6 files changed, 55 deletions(-) delete mode 100644 acceptance/bundle/deploy/wal/normal-deploy/databricks.yml delete mode 100644 acceptance/bundle/deploy/wal/normal-deploy/out.test.toml delete mode 100644 acceptance/bundle/deploy/wal/normal-deploy/output.txt delete mode 100644 acceptance/bundle/deploy/wal/normal-deploy/script delete mode 100644 acceptance/bundle/deploy/wal/normal-deploy/test.py delete mode 100644 acceptance/bundle/deploy/wal/normal-deploy/test.toml diff --git a/acceptance/bundle/deploy/wal/normal-deploy/databricks.yml b/acceptance/bundle/deploy/wal/normal-deploy/databricks.yml deleted file mode 100644 index 4439322e0e6..00000000000 --- a/acceptance/bundle/deploy/wal/normal-deploy/databricks.yml +++ /dev/null @@ -1,14 +0,0 @@ -bundle: - name: wal-test - -resources: - jobs: - test_job: - name: "test-job" - tasks: - - task_key: "test-task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge diff --git a/acceptance/bundle/deploy/wal/normal-deploy/out.test.toml 
b/acceptance/bundle/deploy/wal/normal-deploy/out.test.toml deleted file mode 100644 index e90b6d5d1ba..00000000000 --- a/acceptance/bundle/deploy/wal/normal-deploy/out.test.toml +++ /dev/null @@ -1,3 +0,0 @@ -Local = true -Cloud = false -EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/normal-deploy/output.txt b/acceptance/bundle/deploy/wal/normal-deploy/output.txt deleted file mode 100644 index 2ca4f5f51c1..00000000000 --- a/acceptance/bundle/deploy/wal/normal-deploy/output.txt +++ /dev/null @@ -1,16 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-test/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! -=== Checking WAL file after deploy === -WAL file deleted after successful deploy (expected) -=== State file content === -{ - "lineage": "[UUID]", - "serial": 1, - "state_keys": [ - "resources.jobs.test_job" - ] -} diff --git a/acceptance/bundle/deploy/wal/normal-deploy/script b/acceptance/bundle/deploy/wal/normal-deploy/script deleted file mode 100644 index 5acc4d9b589..00000000000 --- a/acceptance/bundle/deploy/wal/normal-deploy/script +++ /dev/null @@ -1,12 +0,0 @@ -trace $CLI bundle deploy - -echo "=== Checking WAL file after deploy ===" -if [ -f ".databricks/bundle/default/resources.json.wal" ]; then - echo "WAL file exists (unexpected - should be deleted after Finalize)" - cat .databricks/bundle/default/resources.json.wal -else - echo "WAL file deleted after successful deploy (expected)" -fi - -echo "=== State file content ===" -cat .databricks/bundle/default/resources.json | jq -S '{lineage: .lineage, serial: .serial, state_keys: (.state | keys)}' diff --git a/acceptance/bundle/deploy/wal/normal-deploy/test.py b/acceptance/bundle/deploy/wal/normal-deploy/test.py deleted file mode 100644 index 1ff8e07c707..00000000000 --- a/acceptance/bundle/deploy/wal/normal-deploy/test.py +++ /dev/null @@ -1 +0,0 @@ -print("test") diff 
--git a/acceptance/bundle/deploy/wal/normal-deploy/test.toml b/acceptance/bundle/deploy/wal/normal-deploy/test.toml deleted file mode 100644 index 1299046974a..00000000000 --- a/acceptance/bundle/deploy/wal/normal-deploy/test.toml +++ /dev/null @@ -1,9 +0,0 @@ -# WAL is created during deploy, used for state tracking, and deleted after Finalize. - -[[Server]] -Pattern = "POST /api/2.2/jobs/create" -Response.Body = '{"job_id": 1001}' - -[[Server]] -Pattern = "GET /api/2.2/jobs/get" -Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' From cae10125780fcb3b2ad6ccd303c6bf8d6e91119d Mon Sep 17 00:00:00 2001 From: Tester Date: Mon, 11 May 2026 17:10:07 +0200 Subject: [PATCH 72/80] clean up --- .../bundle/deploy/wal/wal-with-delete/output.txt | 14 ++++++++------ .../bundle/deploy/wal/wal-with-delete/script | 9 +-------- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/output.txt b/acceptance/bundle/deploy/wal/wal-with-delete/output.txt index c08e3651772..4eb0fb5724a 100644 --- a/acceptance/bundle/deploy/wal/wal-with-delete/output.txt +++ b/acceptance/bundle/deploy/wal/wal-with-delete/output.txt @@ -9,9 +9,11 @@ Deploying resources... Updating deployment state... Deployment complete! 
=== Final state (should have no jobs) === -{ - "serial": 2, - "state_keys": [] -} -=== WAL after successful deploy === -WAL deleted (expected) + +>>> [CLI] bundle summary +Name: wal-delete-test +Target: default +Workspace: + User: [USERNAME] + Path: /Workspace/Users/[USERNAME]/.bundle/wal-delete-test/default +Resources: diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/script b/acceptance/bundle/deploy/wal/wal-with-delete/script index 5d5a78a885b..1b6708bc0ff 100644 --- a/acceptance/bundle/deploy/wal/wal-with-delete/script +++ b/acceptance/bundle/deploy/wal/wal-with-delete/script @@ -9,11 +9,4 @@ echo "=== Deploy (should recover delete from WAL) ===" trace $CLI bundle deploy echo "=== Final state (should have no jobs) ===" -cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys)}' - -echo "=== WAL after successful deploy ===" -if [ -f ".databricks/bundle/default/resources.json.wal" ]; then - echo "WAL exists (unexpected)" -else - echo "WAL deleted (expected)" -fi +trace $CLI bundle summary From 088ed09c06656d7db27ca5506d2c1c3ebb244eea Mon Sep 17 00:00:00 2001 From: Tester Date: Mon, 11 May 2026 17:30:08 +0200 Subject: [PATCH 73/80] test recover in plan/deploy/summary --- .../wal/crash-after-create/out.test.toml | 1 + .../deploy/wal/crash-after-create/output.txt | 55 ++++++++++++++++--- .../deploy/wal/crash-after-create/script | 30 +++------- 3 files changed, 57 insertions(+), 29 deletions(-) diff --git a/acceptance/bundle/deploy/wal/crash-after-create/out.test.toml b/acceptance/bundle/deploy/wal/crash-after-create/out.test.toml index e90b6d5d1ba..1d895a16c96 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/out.test.toml +++ b/acceptance/bundle/deploy/wal/crash-after-create/out.test.toml @@ -1,3 +1,4 @@ Local = true Cloud = false +EnvMatrix.COMMAND = ["plan", "deploy --force-lock", "summary"] EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git 
a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt index 4eb2e1ea122..0a50333e729 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/output.txt +++ b/acceptance/bundle/deploy/wal/crash-after-create/output.txt @@ -6,11 +6,52 @@ Deploying resources... [PROCESS_KILLED] Exit code: [KILLED] -=== WAL should exist after crash === -WAL exists (expected) -{"lineage":"[UUID]","serial":1,"state_version":2,"cli_version":"[DEV_VERSION]"} -{"k":"resources.jobs.job_a","v":{"__id__":"[NUMID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} -=== State file after crash (should be empty) === -cat: .databricks/bundle/default/resources.json: No such file or directory -Exit code: 1 +>>> assert_exists.py .databricks/bundle/default/resources.json.wal + +>>> assert_not_exists.py .databricks/bundle/default/resources.json + +>>> cat .databricks/bundle/default/resources.json.wal +{ + "lineage": "[UUID]", + "serial": 1, + "state_version": 2, + "cli_version": "[DEV_VERSION]" +} +{ + "k": "resources.jobs.job_a", + "v": { + "__id__": "[NUMID]", + "state": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/state/metadata.json" + }, + "description": "first job", + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "test-job-a", + "queue": { + "enabled": true + }, + "tasks": [ + { + "new_cluster": { + "node_type_id": "[NODE_TYPE_ID]", + "spark_version": 
"15.4.x-scala2.12" + }, + "spark_python_task": { + "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files/test.py" + }, + "task_key": "task-a" + } + ] + } + } +} + +=== Any other command recovers state +>>> assert_exists.py .databricks/bundle/default/resources.json + +>>> assert_not_exists.py .databricks/bundle/default/resources.json.wal diff --git a/acceptance/bundle/deploy/wal/crash-after-create/script b/acceptance/bundle/deploy/wal/crash-after-create/script index f4dba936bba..264d84648d3 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/script +++ b/acceptance/bundle/deploy/wal/crash-after-create/script @@ -1,31 +1,17 @@ # WAL recovery after real crash. First deploy creates job_a then crashes. # Second deploy recovers from WAL and completes successfully. # job_b depends on job_a, so jobs/get is called after job_a's SaveState. -kill_after.py "GET /api/2.2/jobs/get" 0 1 +kill_after.py "POST /api/2.2/jobs/create" 1 1 echo "=== First deploy (crashes after job_a create, before job_b) ===" trace errcode $CLI bundle deploy -echo "=== WAL should exist after crash ===" -if [ -f ".databricks/bundle/default/resources.json.wal" ]; then - echo "WAL exists (expected)" - cat .databricks/bundle/default/resources.json.wal -else - echo "WAL missing (unexpected)" -fi +trace assert_exists.py .databricks/bundle/default/resources.json.wal +trace assert_not_exists.py .databricks/bundle/default/resources.json +trace cat .databricks/bundle/default/resources.json.wal | jq -echo "=== State file after crash (should be empty) ===" -cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys)}' +title "Any other command recovers state" +$CLI bundle $COMMAND &> LOG.COMMAND.txt -echo "=== Second deploy (should recover from WAL and complete) ===" -trace $CLI bundle deploy --force-lock - -echo "=== State file after recovery ===" -cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: 
(.state | keys)}' - -echo "=== WAL file after successful deploy ===" -if [ -f ".databricks/bundle/default/resources.json.wal" ]; then - echo "WAL file exists (unexpected)" -else - echo "WAL file deleted (expected)" -fi +trace assert_exists.py .databricks/bundle/default/resources.json +trace assert_not_exists.py .databricks/bundle/default/resources.json.wal From a52fa51dd758810133cba25d2e29ae9a338dab11 Mon Sep 17 00:00:00 2001 From: Tester Date: Mon, 11 May 2026 17:33:26 +0200 Subject: [PATCH 74/80] clean up --- .../bundle/deploy/wal/empty-wal/output.txt | 17 ------------ acceptance/bundle/deploy/wal/empty-wal/script | 26 +------------------ 2 files changed, 1 insertion(+), 42 deletions(-) diff --git a/acceptance/bundle/deploy/wal/empty-wal/output.txt b/acceptance/bundle/deploy/wal/empty-wal/output.txt index b4ce67ee661..bba6d249fce 100644 --- a/acceptance/bundle/deploy/wal/empty-wal/output.txt +++ b/acceptance/bundle/deploy/wal/empty-wal/output.txt @@ -1,23 +1,6 @@ -=== Creating state directory === -=== Creating empty WAL file === -=== Empty WAL file exists === -[FILE_INFO] .databricks/bundle/default/resources.json.wal -=== Deploy (should handle empty WAL gracefully) === >>> [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-empty-test/default/files... Deploying resources... Updating deployment state... Deployment complete! 
-=== Checking WAL file after deploy === -Empty WAL deleted (expected) -=== Corrupted WAL file === -Corrupted WAL file missing (expected) -=== State file content === -{ - "lineage": "[UUID]", - "serial": 1, - "state_keys": [ - "resources.jobs.test_job" - ] -} diff --git a/acceptance/bundle/deploy/wal/empty-wal/script b/acceptance/bundle/deploy/wal/empty-wal/script index 3929de8eb1f..ac104951c58 100644 --- a/acceptance/bundle/deploy/wal/empty-wal/script +++ b/acceptance/bundle/deploy/wal/empty-wal/script @@ -1,28 +1,4 @@ -echo "=== Creating state directory ===" mkdir -p .databricks/bundle/default - -echo "=== Creating empty WAL file ===" touch .databricks/bundle/default/resources.json.wal - -echo "=== Empty WAL file exists ===" -ls -la .databricks/bundle/default/resources.json.wal - -echo "=== Deploy (should handle empty WAL gracefully) ===" trace $CLI bundle deploy - -echo "=== Checking WAL file after deploy ===" -if [ -f ".databricks/bundle/default/resources.json.wal" ]; then - echo "WAL file exists (unexpected)" -else - echo "Empty WAL deleted (expected)" -fi - -echo "=== Corrupted WAL file ===" -if [ -f ".databricks/bundle/default/resources.json.wal.corrupted" ]; then - ls -la .databricks/bundle/default/resources.json.wal.corrupted -else - echo "Corrupted WAL file missing (expected)" -fi - -echo "=== State file content ===" -cat .databricks/bundle/default/resources.json | jq -S '{lineage: .lineage, serial: .serial, state_keys: (.state | keys)}' +assert_not_exists.py .databricks/bundle/default/resources.json.wal* From 16de5c1f50ffa0e61d17c2fd39c71fcbefe0d87c Mon Sep 17 00:00:00 2001 From: Tester Date: Mon, 11 May 2026 17:33:59 +0200 Subject: [PATCH 75/80] add assert_*.py --- acceptance/bin/assert_exists.py | 12 ++++++++++++ acceptance/bin/assert_not_exists.py | 12 ++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 acceptance/bin/assert_exists.py create mode 100644 acceptance/bin/assert_not_exists.py diff --git a/acceptance/bin/assert_exists.py 
b/acceptance/bin/assert_exists.py new file mode 100644 index 00000000000..0d33b46d2aa --- /dev/null +++ b/acceptance/bin/assert_exists.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python3 +import os, sys + +errors = 0 + +for filename in sys.argv[1:]: + if not os.path.exists(filename): + sys.stderr.write(f"Unexpected: {filename} does not exist.\n") + errors += 1 + +if errors: + sys.exit(1) diff --git a/acceptance/bin/assert_not_exists.py b/acceptance/bin/assert_not_exists.py new file mode 100644 index 00000000000..76d467e4515 --- /dev/null +++ b/acceptance/bin/assert_not_exists.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python3 +import os, sys + +errors = 0 + +for filename in sys.argv[1:]: + if os.path.exists(filename): + sys.stderr.write(f"Unexpected: {filename} exists.\n") + errors += 1 + +if errors: + sys.exit(1) From 3866db14378b59f573509596af25c97aca9b7726 Mon Sep 17 00:00:00 2001 From: Tester Date: Mon, 11 May 2026 17:42:11 +0200 Subject: [PATCH 76/80] corrupted-wal-entry: use envsubst + template file for WAL generation Co-authored-by: Isaac --- .../bundle/deploy/wal/corrupted-wal-entry/output.txt | 8 ++++---- .../wal/corrupted-wal-entry/resources.json.wal.tmpl | 4 ++++ acceptance/bundle/deploy/wal/corrupted-wal-entry/script | 8 +------- 3 files changed, 9 insertions(+), 11 deletions(-) create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal.tmpl diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt index afd717a27b3..fa6e0819110 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt @@ -1,8 +1,8 @@ >>> cat .databricks/bundle/default/resources.json.wal {"lineage":"test-lineage-123","serial":6} -{"k":"resources.jobs.valid_job","v":{"__id__":"[JOB1_ID]","state":{"name":"valid-job"}}} -{"k":"resources.jobs.another_valid","v":{"__id__":"[JOB2_ID]","state":{"name":"another-valid"}}}
+{"k":"resources.jobs.valid_job","v":{"__id__":"","state":{"name":"valid-job"}}} +{"k":"resources.jobs.another_valid","v":{"__id__":"","state":{"name":"another-valid"}}} {"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial- >>> [CLI] bundle deploy Warn: Skipping corrupted WAL entry at [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal:4: unexpected end of JSON input @@ -22,10 +22,10 @@ Resources: Jobs: another_valid: Name: another-valid - URL: [DATABRICKS_URL]/jobs/[JOB2_ID]?o=[NUMID] + URL: [DATABRICKS_URL]/jobs/[NUMID]?o=[NUMID] valid_job: Name: valid-job - URL: [DATABRICKS_URL]/jobs/[JOB1_ID]?o=[NUMID] + URL: [DATABRICKS_URL]/jobs/[NUMID]?o=[NUMID] >>> cat .databricks/bundle/default/resources.json.wal.corrupted {"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial- diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal.tmpl b/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal.tmpl new file mode 100644 index 00000000000..44f3bbdaf40 --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal.tmpl @@ -0,0 +1,4 @@ +{"lineage":"test-lineage-123","serial":6} +{"k":"resources.jobs.valid_job","v":{"__id__":"$JOB1","state":{"name":"valid-job"}}} +{"k":"resources.jobs.another_valid","v":{"__id__":"$JOB2","state":{"name":"another-valid"}}} +{"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial- \ No newline at end of file diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script index ae828cdb6b1..d6f151a29c6 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script @@ -7,13 +7,7 @@ echo "$JOB2:JOB2_ID" >> ACC_REPLS mkdir -p .databricks/bundle/default cp resources.json .databricks/bundle/default/ -# Generate WAL with actual job IDs; truncate the partial_write entry to 
simulate corruption -{ - printf '{"lineage":"test-lineage-123","serial":6}\n' - printf '{"k":"resources.jobs.valid_job","v":{"__id__":"%s","state":{"name":"valid-job"}}}\n' "$JOB1" - printf '{"k":"resources.jobs.another_valid","v":{"__id__":"%s","state":{"name":"another-valid"}}}\n' "$JOB2" - printf '{"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial-' -} > .databricks/bundle/default/resources.json.wal +envsubst < resources.json.wal.tmpl > .databricks/bundle/default/resources.json.wal trace cat .databricks/bundle/default/resources.json.wal trace $CLI bundle deploy From ae293dd3293ed15d74f03f2725089559676d2efd Mon Sep 17 00:00:00 2001 From: Tester Date: Mon, 11 May 2026 17:46:14 +0200 Subject: [PATCH 77/80] kill_caller selftests: move test.toml comments to script, remove empty test.toml files Co-authored-by: Isaac --- acceptance/selftest/kill_caller/currentuser/script | 1 + acceptance/selftest/kill_caller/currentuser/test.toml | 1 - acceptance/selftest/kill_caller/multiple/script | 1 + acceptance/selftest/kill_caller/multiple/test.toml | 1 - acceptance/selftest/kill_caller/offset/script | 1 + acceptance/selftest/kill_caller/offset/test.toml | 1 - acceptance/selftest/kill_caller/workspace/script | 1 + acceptance/selftest/kill_caller/workspace/test.toml | 1 - 8 files changed, 4 insertions(+), 4 deletions(-) delete mode 100644 acceptance/selftest/kill_caller/currentuser/test.toml delete mode 100644 acceptance/selftest/kill_caller/multiple/test.toml delete mode 100644 acceptance/selftest/kill_caller/offset/test.toml delete mode 100644 acceptance/selftest/kill_caller/workspace/test.toml diff --git a/acceptance/selftest/kill_caller/currentuser/script b/acceptance/selftest/kill_caller/currentuser/script index bbac4ab29ab..dbd96b12a94 100644 --- a/acceptance/selftest/kill_caller/currentuser/script +++ b/acceptance/selftest/kill_caller/currentuser/script @@ -1,3 +1,4 @@ +# Kill the CLI when it calls /Me endpoint (once, then allow) kill_after.py 
"GET /api/2.0/preview/scim/v2/Me" 0 1 trace errcode $CLI current-user me echo "Script continued after kill" diff --git a/acceptance/selftest/kill_caller/currentuser/test.toml b/acceptance/selftest/kill_caller/currentuser/test.toml deleted file mode 100644 index f6311367158..00000000000 --- a/acceptance/selftest/kill_caller/currentuser/test.toml +++ /dev/null @@ -1 +0,0 @@ -# Kill the CLI when it calls /Me endpoint (once, then allow) diff --git a/acceptance/selftest/kill_caller/multiple/script b/acceptance/selftest/kill_caller/multiple/script index a3659bf58fd..1e089f3cc0f 100644 --- a/acceptance/selftest/kill_caller/multiple/script +++ b/acceptance/selftest/kill_caller/multiple/script @@ -1,3 +1,4 @@ +# Kill the CLI 3 times, then allow the 4th request to succeed kill_after.py "GET /api/2.0/preview/scim/v2/Me" 0 3 # First 3 attempts should be killed diff --git a/acceptance/selftest/kill_caller/multiple/test.toml b/acceptance/selftest/kill_caller/multiple/test.toml deleted file mode 100644 index 24f7ca19229..00000000000 --- a/acceptance/selftest/kill_caller/multiple/test.toml +++ /dev/null @@ -1 +0,0 @@ -# Kill the CLI 3 times, then allow the 4th request to succeed diff --git a/acceptance/selftest/kill_caller/offset/script b/acceptance/selftest/kill_caller/offset/script index 1bf3d0d4c2e..6abee0dcac7 100644 --- a/acceptance/selftest/kill_caller/offset/script +++ b/acceptance/selftest/kill_caller/offset/script @@ -1,3 +1,4 @@ +# Let first 2 requests pass, kill next 2, then allow rest kill_after.py "GET /api/2.0/preview/scim/v2/Me" 2 2 # First 2 attempts should succeed (offset period) diff --git a/acceptance/selftest/kill_caller/offset/test.toml b/acceptance/selftest/kill_caller/offset/test.toml deleted file mode 100644 index 7b8d50906c6..00000000000 --- a/acceptance/selftest/kill_caller/offset/test.toml +++ /dev/null @@ -1 +0,0 @@ -# Let first 2 requests pass, kill next 2, then allow rest diff --git a/acceptance/selftest/kill_caller/workspace/script 
b/acceptance/selftest/kill_caller/workspace/script index 8fb9dab3f1a..5a21881ab3f 100644 --- a/acceptance/selftest/kill_caller/workspace/script +++ b/acceptance/selftest/kill_caller/workspace/script @@ -1,3 +1,4 @@ +# Kill the CLI when it calls workspace list endpoint (once, then allow) kill_after.py "GET /api/2.0/workspace/list" 0 1 trace errcode $CLI workspace list / echo "Script continued after kill" diff --git a/acceptance/selftest/kill_caller/workspace/test.toml b/acceptance/selftest/kill_caller/workspace/test.toml deleted file mode 100644 index 80d2fbbfd17..00000000000 --- a/acceptance/selftest/kill_caller/workspace/test.toml +++ /dev/null @@ -1 +0,0 @@ -# Kill the CLI when it calls workspace list endpoint (once, then allow) From 202c0ac172c3a16884117aa90221afae6f2cb566 Mon Sep 17 00:00:00 2001 From: Tester Date: Tue, 12 May 2026 13:25:35 +0200 Subject: [PATCH 78/80] formatting --- acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt | 1 + .../deploy/wal/corrupted-wal-entry/resources.json.wal.tmpl | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt index fa6e0819110..1aee4fe481d 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt @@ -4,6 +4,7 @@ {"k":"resources.jobs.valid_job","v":{"__id__":"","state":{"name":"valid-job"}}} {"k":"resources.jobs.another_valid","v":{"__id__":"","state":{"name":"another-valid"}}} {"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial- + >>> [CLI] bundle deploy Warn: Skipping corrupted WAL entry at [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal:4: unexpected end of JSON input Warn: Saved 1 corrupted WAL entries to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted diff --git 
a/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal.tmpl b/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal.tmpl index 44f3bbdaf40..7ef5773a4ea 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal.tmpl +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal.tmpl @@ -1,4 +1,4 @@ {"lineage":"test-lineage-123","serial":6} {"k":"resources.jobs.valid_job","v":{"__id__":"$JOB1","state":{"name":"valid-job"}}} {"k":"resources.jobs.another_valid","v":{"__id__":"$JOB2","state":{"name":"another-valid"}}} -{"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial- \ No newline at end of file +{"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial- From 9a1bb574007f24c39a76b8e1118bd9a54eec180d Mon Sep 17 00:00:00 2001 From: Tester Date: Tue, 12 May 2026 15:43:37 +0200 Subject: [PATCH 79/80] fix CI: commit missing test.tomls and fix assert_*.py permissions - acceptance/bundle/deploy/wal/crash-after-create/test.toml and lineage-mismatch/test.toml were untracked; scripts using $COMMAND failed with "unbound variable" on CI - assert_exists.py and assert_not_exists.py were tracked as 100644; CI ran them as non-executable, producing "Permission denied" errors Co-authored-by: Isaac --- acceptance/bin/assert_exists.py | 0 acceptance/bin/assert_not_exists.py | 0 acceptance/bundle/deploy/wal/crash-after-create/test.toml | 2 ++ acceptance/bundle/deploy/wal/lineage-mismatch/test.toml | 1 + 4 files changed, 3 insertions(+) mode change 100644 => 100755 acceptance/bin/assert_exists.py mode change 100644 => 100755 acceptance/bin/assert_not_exists.py create mode 100644 acceptance/bundle/deploy/wal/crash-after-create/test.toml create mode 100644 acceptance/bundle/deploy/wal/lineage-mismatch/test.toml diff --git a/acceptance/bin/assert_exists.py b/acceptance/bin/assert_exists.py old mode 100644 new mode 100755 diff --git a/acceptance/bin/assert_not_exists.py 
b/acceptance/bin/assert_not_exists.py old mode 100644 new mode 100755 diff --git a/acceptance/bundle/deploy/wal/crash-after-create/test.toml b/acceptance/bundle/deploy/wal/crash-after-create/test.toml new file mode 100644 index 00000000000..ecd87c31a8b --- /dev/null +++ b/acceptance/bundle/deploy/wal/crash-after-create/test.toml @@ -0,0 +1,2 @@ +EnvMatrix.COMMAND = ["plan", "deploy --force-lock", "summary"] +EnvRepl.COMMAND = false diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/test.toml b/acceptance/bundle/deploy/wal/lineage-mismatch/test.toml new file mode 100644 index 00000000000..0b3a9e0b7cc --- /dev/null +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/test.toml @@ -0,0 +1 @@ +EnvMatrix.COMMAND = ["deploy", "plan", "summary"] From 2d5eea9302626bbae6b38230267210e45a08433d Mon Sep 17 00:00:00 2001 From: Tester Date: Tue, 12 May 2026 16:43:20 +0200 Subject: [PATCH 80/80] fix: use TOML basic strings with \n escapes in Repls to avoid CRLF on Windows Multiline TOML basic strings (""") use literal newlines from the file. On Windows with autocrlf=true, these become CRLF. After NormalizeNewlines strips \r from the test output, the replacement re-introduces \r via the New string, causing the comparison to fail. Using single-line basic strings with \n escapes ensures the newlines in the replacement are always LF regardless of platform. 
Co-authored-by: Denis Bilenko --- acceptance/bundle/deploy/wal/test.toml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/acceptance/bundle/deploy/wal/test.toml b/acceptance/bundle/deploy/wal/test.toml index 2be1964ae6b..e60e6992455 100644 --- a/acceptance/bundle/deploy/wal/test.toml +++ b/acceptance/bundle/deploy/wal/test.toml @@ -14,9 +14,7 @@ New = '[PROCESS_KILLED]' [[Repls]] Old = '(\n>>> errcode [^\n]+\n)\nExit code:' -New = """${1}[PROCESS_KILLED] - -Exit code:""" +New = "${1}[PROCESS_KILLED]\n\nExit code:" [[Repls]] Old = 'Exit code: 137' @@ -33,7 +31,4 @@ New = '${1}[KILLED]' # Match the raw exit code 1 (Windows never gets 137 or [PROCESS_KILLED] marker first). [[Repls]] Old = '(Deploying resources\.\.\.)\n\nExit code: 1' -New = """${1} -[PROCESS_KILLED] - -Exit code: [KILLED]""" +New = "${1}\n[PROCESS_KILLED]\n\nExit code: [KILLED]"